import copy  # used by the commented-out feature-removal experiment in main()
import os    # used by the commented-out CSV export in main()

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingRegressor

import preprocessing
import reading
import visualization


def buildAndTestModel():
    """Train on the full training set and write predictions for the test set
    to data/submission.csv in the Id/SalePrice submission format."""
    houseTrain, inputCols, outputCol = reading.readData("data/train.csv")
    inputCols.remove('Id')
    houseTest = reading.readTestData("data/test.csv")
    preprocessing.manageNAValues(houseTrain, inputCols)
    preprocessing.preprocess(houseTest, houseTrain, inputCols)
    preprocessing.preprocess(houseTrain, houseTrain, inputCols)
    alg = GradientBoostingRegressor(random_state=1, n_estimators=800)
    alg.fit(houseTrain.loc[:, inputCols], houseTrain.loc[:, outputCol])
    predictions = alg.predict(houseTest.loc[:, inputCols])
    submitDF = pd.DataFrame({"Id": houseTest.loc[:, "Id"], "SalePrice": predictions})
    submitDF.to_csv("data/submission.csv", index=False)
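# A minimal sketch (a hypothetical helper, not part of the original module) of
# how a holdout split could sanity-check the model before writing a
# submission. The 80/20 split and the use of r2 here are assumptions, not
# settings taken from the project.
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


def holdoutCheckExample():
    houseTrain, inputCols, outputCol = reading.readData("data/train.csv")
    inputCols.remove('Id')
    preprocessing.manageNAValues(houseTrain, inputCols)
    preprocessing.preprocess(houseTrain, houseTrain, inputCols)
    # Hold out 20% of the training rows as a validation set (assumed split).
    trainX, valX, trainY, valY = train_test_split(
        houseTrain.loc[:, inputCols], houseTrain.loc[:, outputCol],
        test_size=0.2, random_state=1)
    alg = GradientBoostingRegressor(random_state=1, n_estimators=800)
    alg.fit(trainX, trainY)
    print("Holdout r2:", r2_score(valY, alg.predict(valX)))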
def buildAndTestModelWithCV():
    """Variant of buildAndTestModel: report a 10-fold cross-validated r2 score
    on the training data, then refit on all of it and write the submission."""
    houseTrain, inputCols, outputCol = reading.readData()
    preprocessing.preprocess(houseTrain, houseTrain, inputCols)
    alg = GradientBoostingRegressor(random_state=1, n_estimators=800)
    cvScores = model_selection.cross_val_score(alg, houseTrain.loc[:, inputCols],
                                               houseTrain.loc[:, outputCol], cv=10, scoring='r2')
    print(np.mean(cvScores))
    houseTest = pd.read_csv("data/test.csv")
    preprocessing.preprocess(houseTest, houseTrain, inputCols)
    alg = GradientBoostingRegressor(random_state=1, n_estimators=800)
    alg.fit(houseTrain.loc[:, inputCols], houseTrain.loc[:, outputCol])
    predictions = alg.predict(houseTest.loc[:, inputCols])
    # The submission pairs the test-set Id column with the predicted SalePrice;
    # houseTest itself has no SalePrice column to read from.
    submitDF = pd.DataFrame({"Id": houseTest.loc[:, "Id"], "SalePrice": predictions})
    submitDF.to_csv("data/submission.csv", index=False)
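# A small illustrative addition (a hypothetical helper, not in the original
# code): printing the spread of the fold scores alongside the mean makes the
# cross-validation estimate above easier to judge.
def reportCVSpreadExample(alg, trainDF, inputsCol, outputCol):
    cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, inputsCol],
                                               trainDF.loc[:, outputCol], cv=10, scoring='r2')
    # Report mean and standard deviation across the 10 folds.
    print("CV r2: %.4f +/- %.4f" % (np.mean(cvScores), np.std(cvScores)))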
def main():
    trainDF, inputsCol, outputCol = reading.readData()
    preprocessing.preprocess(trainDF, trainDF, inputsCol)
    # With random_state set, the score does not change from run to run.
    alg = GradientBoostingRegressor(random_state=1, n_estimators=800)
    # additionalCols = ['Attic', 'Finished', 'Split', 'Foyer', 'Duplex', 'Pud', 'Conversion', 'Story']
    # inputsCol = inputsCol + additionalCols
    # inputsCol = list(set(inputsCol) - set(['MSSubClass']))
    cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, inputsCol],
                                               trainDF.loc[:, outputCol], cv=10, scoring='r2')
    print("Highest Accuracy with all features, default parameterizations =", np.mean(cvScores))
    # inputsCol = additionalCols + [outputCol]
    # visualization.visualize(trainDF, additionalCols, outputCol)
    # visualization.visualize(trainDF, inputsCol, outputCol)

    # Already done: testing what happens to the accuracy when removing one feature at a time.
    """
    inputsColTemp = copy.deepcopy(inputsCol)
    temp = {}
    temp["Nothing removed"] = np.mean(cvScores)
    while len(inputsColTemp) != 0:
        featureRemoved = inputsColTemp.pop()
        inputsCol.remove(featureRemoved)
        alg = GradientBoostingRegressor(random_state=1, n_estimators=800)
        cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, inputsCol],
                                                   trainDF.loc[:, outputCol], cv=10, scoring='r2')
        temp[featureRemoved] = np.mean(cvScores)
        print("Accuracy when removing " + featureRemoved + " =", np.mean(cvScores))
        inputsCol.append(featureRemoved)
    export = pd.Series(temp)
    export.to_csv(os.getcwd() + '/removeOneFeature_postParameterization.csv')
    """

    # Testing various parameterizations.
    '''
    print("Changing n:")
    i = 100
    while i <= 1000:
        alg = GradientBoostingRegressor(random_state=1, n_estimators=i)
        cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, inputsCol],
                                                   trainDF.loc[:, outputCol], cv=10, scoring='r2')
        print("For score of i = " + str(i) + ":", np.mean(cvScores))
        i += 100
    '''
    '''
    print("Changing learning rate:")
    j = 0.01
    while j <= 1:
        alg = GradientBoostingRegressor(random_state=1, learning_rate=j)
        cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, inputsCol],
                                                   trainDF.loc[:, outputCol], cv=10, scoring='r2')
        print("For rate of j = " + str(j) + ":", np.mean(cvScores))
        j += 0.05
    '''
    """
    print("Testing proposed optimum settings (approximate):")
    alg = GradientBoostingRegressor(random_state=1, n_estimators=900, learning_rate=0.16)
    cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, inputsCol],
                                               trainDF.loc[:, outputCol], cv=10, scoring='r2')
    print("Score:", np.mean(cvScores))
    """

    # Results of the n_estimators sweep:
    """
    For score of i = 100: 0.9695838362322334
    For score of i = 200: 0.9713926415403247
    For score of i = 300: 0.97218152251929
    For score of i = 400: 0.9725433964858627
    For score of i = 500: 0.9727129864730497
    For score of i = 600: 0.9727150286009545
    For score of i = 700: 0.9727550478839996
    For score of i = 800: 0.9728148128730638
    For score of i = 900: 0.9727879075157484
    For score of i = 1000: 0.9727582470818399
    """

    # Results of the learning-rate sweep:
    """
    Changing learning rate:
    For rate of j = 0.01: 0.7866244576707975
    For rate of j = 0.060000000000000005: 0.9673647242443091
    For rate of j = 0.11000000000000001: 0.9685577086455324
    For rate of j = 0.16000000000000003: 0.9706671931416834
    For rate of j = 0.21000000000000002: 0.9704141690215575
    For rate of j = 0.26: 0.9693811639251051
    For rate of j = 0.31: 0.9640193537044937
    For rate of j = 0.36: 0.9662721202679398
    For rate of j = 0.41: 0.9650110713531361
    For rate of j = 0.45999999999999996: 0.9598916595713343
    For rate of j = 0.51: 0.9612472671240511
    For rate of j = 0.56: 0.9582289549106051
    For rate of j = 0.6100000000000001: 0.9532347135301027
    For rate of j = 0.6600000000000001: 0.9568633720893616
    For rate of j = 0.7100000000000002: 0.951672708291688
    For rate of j = 0.7600000000000002: 0.9524549341492469
    For rate of j = 0.8100000000000003: 0.9461558271946371
    For rate of j = 0.8600000000000003: 0.9354096980604721
    For rate of j = 0.9100000000000004: 0.9328281792809602
    For rate of j = 0.9600000000000004: 0.9325540177700384
    """

    # Optimum for both at the same time = 0.9708920005637681... interesting...
    # so it looks like it is better to focus on n_estimators.

    # Testing combinations of removing items:
    # without ["HalfBath", "LandSlope", "BldgType", "YearBuilt", "LowQualFinSF", "Utilities"]: 0.9697767256899044
    # without ["HalfBath", "LandSlope", "BldgType", "YearBuilt", "LowQualFinSF"]: 0.9689720298816091
    # without ["HalfBath", "LandSlope", "BldgType", "YearBuilt"]: 0.9689965270724571
    # without ["HalfBath", "LandSlope", "BldgType"]: 0.9701461139168301
    # without ["HalfBath", "LandSlope"]: 0.968366504937428
    # without ["HalfBath"]: 0.968366504937428
    # TODO It seems that even when deleting a single column improves the result,
    # removing multiple columns worsens the accuracy.

    # Test function for the accuracies above.
    # itemsToRemove = set(["HalfBath"])  # changed to ScreenPorch from ScreenProch
    # TODO Why does this result differ from the result in the while loop above?
    # It should match the accuracy 0.97021332160605. Does the order of the list matter?
    # post_featureRemoval = filter(lambda x: x not in itemsToRemove, inputsCol)
    # post_featureRemoval = list(set(inputsCol) - itemsToRemove)
    # inputsCol.remove('HalfBath')
    # alg = GradientBoostingRegressor(random_state=1)
    # cvScores = model_selection.cross_val_score(alg, trainDF.loc[:, post_featureRemoval],
    #                                            trainDF.loc[:, outputCol], cv=10, scoring='r2')
    # print("After removing all detrimental features =", np.mean(cvScores))
    # This results in a lower value... does this mean some of these features are
    # related, or do they simply need preprocessing (e.g., YearBuilt should
    # probably become an age)?

    # visualization.visualize(trainDF, inputsCol, outputCol)

    # Testing a different model (linear regression)
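# A minimal sketch (a hypothetical helper, not from the original experiments)
# of how the two manual sweeps above could be combined into one joint search
# with scikit-learn's GridSearchCV. The parameter ranges are assumptions taken
# from the loop bounds in the commented-out code.
from sklearn.model_selection import GridSearchCV


def jointParameterSearchExample(trainDF, inputsCol, outputCol):
    # Same grid as the two while loops: n_estimators 100..1000 in steps of 100,
    # learning_rate 0.01..0.96 in steps of 0.05.
    paramGrid = {
        "n_estimators": list(range(100, 1001, 100)),
        "learning_rate": [round(0.01 + 0.05 * k, 2) for k in range(20)],
    }
    search = GridSearchCV(GradientBoostingRegressor(random_state=1),
                          paramGrid, cv=10, scoring='r2')
    search.fit(trainDF.loc[:, inputsCol], trainDF.loc[:, outputCol])
    print("Best parameters:", search.best_params_)
    print("Best CV r2:", search.best_score_)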
def visualizationTest():
    trainDF, inputsCol, outputCol = reading.readData()
    inputsCol = ['MSSubClass']
    visualization.visualizeScatterplot(trainDF, inputsCol, outputCol)
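# Assumed entry point: the original file does not record which function was
# run, so calling main() here is a guess.
if __name__ == "__main__":
    main()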