# Shared imports assumed by the blending / stacking scripts below.
# csv_io, csv_io_np, stack, util and get_date_dataframe are project-local helpers.
import datetime
import gc
import math

import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn import cross_validation, metrics, preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression

import csv_io
import stack
import util


def Blend():
    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine=False, split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    SEED = 448
    #random.seed(SEED)
    #random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

    clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]
    # Alternative level-2 models (commented out): LogisticRegression with
    # penalty in {'l2', 'l1'} and C in {1.0, 0.5, 0.1}, dual=False, tol=0.0001,
    # fit_intercept=True, intercept_scaling=1, class_weight=None.

    test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    dataset_blend_test_set = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):
        clf.fit(dataset_blend_train, target)
        submission = clf.predict(dataset_blend_test)

        now = datetime.datetime.now()
        # Write the single-model submission as formatted strings, but keep the
        # numeric predictions for the blend matrix below.
        csv_io.write_delimited_file("../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv",
                                    ["%f" % x for x in submission])

        # Attempt to score the training set to predict the score for the blend.
        probSum = 0.0
        weightSum = 0
        trainPrediction = clf.predict(dataset_blend_train)
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]
            probSum += weights[i][0] * math.fabs(target[i] - probX)
            weightSum += weights[i][0]
            #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)

        print "Train Score: ", (probSum / weightSum)

        dataset_blend_test_set[:, ExecutionIndex] = submission

    csv_io.write_delimited_file_single("../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
                                       dataset_blend_test_set.mean(1))
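# The "Train Score" loop above is a weighted mean absolute error. A minimal
# standalone sketch of that metric (an illustration, not part of the original
# pipeline), assuming each row of weights holds a single per-sample weight as
# read by csv_io.read_data:
def weighted_mae(targets, predictions, weights):
    numerator = sum(w[0] * math.fabs(t - p) for t, p, w in zip(targets, predictions, weights))
    denominator = sum(w[0] for w in weights)
    return numerator / denominator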
def run_stack(SEED):

    model = "Long-Lat KNN5 - 50 Features"

    print "Running GB, RF, ET stack."

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine=False, split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine=False, split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    #random.seed(SEED)
    #random.shuffle(trainBase)

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant
                  # (note, predictions are less reliable when using 10).

    predicted_list = []
    bootstrapLists = []

    # Commented-out quick-run alternatives kept from earlier experiments
    # (note: RF with 150 estimators crashes on 30 features):
    #   GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)
    #   GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125)
    #   RandomForestRegressor(n_estimators=100 or 75, n_jobs=1)
    #   ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
    #   SVC(C=1.0, gamma=2**-5.5, kernel='rbf', probability=False)
    #
    # KNN results: k=5 at 3.45, k=15 at 3.31, k=25 at 3.30, k=40 at 3.31
    #   KNeighborsRegressor(n_neighbors in {5, 15, 25, 35}, weights='uniform', algorithm='auto', leaf_size=30, p=2)
    # Linear models: LinearRegression at 3.77, Ridge at 3.77, SGDRegressor at 4.23, GaussianNB at 13.
    #
    # GB grid: 125 estimators is the minimum, score is bad below this; explore higher and other dimensions.
    #   GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth in {6, 8, 10},
    #                             n_estimators in {100, 200}, random_state=166)
    #   -> about 1 hour run time, 3.10 score.
    #   n_estimators=400 at max_depth 6 or 8: about 2 hours, 3.05 / 3.06.
    #   n_estimators=800 at max_depth 6 or 8: about 4 hours, 3.06.
    clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166)]

    # Earlier classifier stacks (commented out, quick-run and full-stack variants):
    # GradientBoostingClassifier with learn_rate=0.01, subsample=0.2, max_depth in {1, 2, 3, 4},
    # n_estimators in {80, 120, 160, 200, 240, 280, 320} (plus learn_rate=0.05/max_depth=6/n_estimators=50
    # and learn_rate=0.02/max_depth=8/n_estimators=125), RandomForestClassifier and ExtraTreesClassifier
    # with n_estimators in {100, 150}, criterion in {'gini', 'entropy'}, min_density in {0.02, 0.05, 0.09, 0.2},
    # various fixed random_state seeds, and an SVC(C=1.0, gamma=2**-5.5, kernel='rbf') variant.

    print "Data size: ", len(trainBase), len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]
    #print trainPre[0]

    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    testScaled = scaler.transform(testPre)
    #print scaler.mean_
    #print scaler.std_

    print "Begin Training"

    for ExecutionIndex, clf in enumerate(clfs):
        print str(clf)
        avg = 0

        predicted_list = []
        dataset_blend_test_set = np.zeros((len(test), NumFolds))

        foldCount = 0

        # Use StratifiedKFold for classification: [trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
        for train_index, test_index in Folds:

            target = [targetPre[i] for i in train_index]
            train = [trainScaled[i] for i in train_index]

            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainScaled[i] for i in test_index]

            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(target)

            clf.fit(train, target)
            prob = clf.predict(trainTest)

            dataset_blend_train[test_index, ExecutionIndex] = prob

            # Weighted absolute error on the held-out fold.
            probSum = 0
            weightSum = 0
            for i in range(0, len(prob)):
                probX = prob[i]
                probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                weightSum += weights[test_index[i]][0]
                # log loss alternative:
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)

            # Commented-out diagnostics from earlier runs printed the per-row weight,
            # actual and predicted values, and counted totalOffByHalf, totalPositive,
            # totalPositiveOffByHalf and totalPositivePredictions.

            print "Score: ", probSum / weightSum
            avg += (probSum / weightSum) / NumFolds

            predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1))
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions_40/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""],
                                    filemode="a", delimiter=",")

        print now
        print "------------------------Average: ", avg

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test
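# The cross-validation bookkeeping above follows the standard out-of-fold
# stacking pattern: level-1 predictions for training rows come only from folds
# that did not train on them, while test predictions are averaged over folds.
# A condensed sketch of that idea (illustrative only; names are new and the
# KFold call follows the old sklearn cross_validation API used in these scripts):
def out_of_fold_stack(clf, trainX, trainY, testX, NumFolds=5):
    blend_train = np.zeros(len(trainX))
    blend_test_folds = np.zeros((len(testX), NumFolds))
    folds = cross_validation.KFold(len(trainX), k=NumFolds, indices=True)
    for foldCount, (train_index, test_index) in enumerate(folds):
        clf.fit([trainX[i] for i in train_index], [trainY[i] for i in train_index])
        blend_train[test_index] = clf.predict([trainX[i] for i in test_index])
        blend_test_folds[:, foldCount] = clf.predict(testX)
    return blend_train, blend_test_folds.mean(1)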
print "Score: ", auc avg += auc/NumFolds predicted_probs = clf.predict_proba(finalTestSparse) #predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_set[:, foldCount] = predicted_probs[:,1] foldCount = foldCount + 1 #break dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1) now = datetime.datetime.now() csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",") print "------------------------Average: ", avg
def run_stack(SEED):

    model = ""

    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    targetX = csv_io.read_data("target.csv", skipFirstLine=False, split=",")
    trainBase = csv_io.read_data("train1.csv", skipFirstLine=False, split=",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    trainBase = trainBase[0:5000]
    targetX = targetX[0:5000]

    train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine=False, split=",")
    test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine=False, split=",")

    predicted_list = []
    bootstrapLists = []

    clfs = [GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)]
    # Alternatives tried:
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    print "Data size: ", len(trainBase), 11573  # len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros((11573, len(clfs)))  # was np.zeros(11573, len(clfs)), which is not a valid shape

    print "Begin Training"

    lenTrainBase = len(trainBase)
    lenTest = 11573  # len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:

            target = [targetX[i] for i in train_index]
            train = [trainBase[i] for i in train_index]

            targetTest = [targetX[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(target)

            print "Start", datetime.datetime.now()
            clf.fit(train, target)
            prob = clf.predict(trainTest)
            print "End ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            # Root-mean-squared error in log10 space on the held-out fold.
            probSum = 0
            weightSum = 0
            for i in range(0, len(prob)):
                probX = prob[i]
                probX = 31100.0  # note: this overrides the model output with a constant, so the score below is a fixed-prediction baseline
                print targetTest[i][0], probX
                probSum += math.pow(math.log10(targetTest[i][0]) - math.log10(probX), 2)

            print "Score: ", math.sqrt(probSum / len(prob))
            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            # Predict the test set row by row to keep memory use low.
            fo = open("test1.csv", "r")
            predicted_probs = []
            for line in fo:
                newRow = [float(item) for item in line.strip().split(",")]
                predicted_probs.append(clf.predict([newRow])[0])  # predict expects a 2D sample array; take the single value
            fo.close()

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs

            gc.collect()
            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""],
                                    filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
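# The fold score above is a root-mean-squared error in log10 space. A compact
# sketch of the same metric (illustrative only; note that RMSLE is usually
# defined with the natural log of 1 + x, whereas this code uses log10):
def rmsle_log10(targets, predictions):
    total = sum(math.pow(math.log10(t) - math.log10(p), 2) for t, p in zip(targets, predictions))
    return math.sqrt(total / len(targets))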
def run_stack(SEED):

    train, test = util.get_train_test_df()

    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])

    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            # Map string levels to integer codes; unseen test values map to -1.
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
            train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
            test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""

    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    predicted_list = []
    bootstrapLists = []

    clfs = [GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1)]
    # Alternatives tried:
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    print "Data size: ", len(train_fea), len(test_fea)

    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))
    dataset_blend_test = np.zeros((len(test_fea), len(clfs)))

    print "Begin Training"

    lenTrainBase = 401125  # len(train_fea)
    lenTest = 11573  # len(test_fea)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:

            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]

            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)

            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            # Root-mean-squared error in log10 space on the held-out fold.
            probSum = 0
            weightSum = 0
            for i in range(0, len(prob)):
                probX = prob[i]
                #print targetTest[i], probX
                if probX < 0:  # some predictions are coming out negative
                    probX = -probX
                probSum += math.pow(math.log10(targetTest[i]) - math.log10(probX), 2)

            print "Score: ", math.sqrt(probSum / len(prob))
            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            # Predict the test set row by row.
            predicted_probs = []
            for i in range(0, lenTest):
                predicted_probs.append(clf.predict([test_fea.ix[i]])[0])  # predict expects a 2D sample array; take the single value

            dataset_blend_test_set[:, foldCount] = predicted_probs

            gc.collect()
            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""],
                                    filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
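# The column loop above assigns an integer code per string level, with unseen
# test values falling back to -1 via the defaultdict. A minimal standalone
# sketch of that encoding (an illustrative helper, not part of the original
# script):
def encode_string_column(train_values, test_values):
    levels = set(train_values)
    to_code = defaultdict(lambda: -1, ((level, code) for code, level in enumerate(levels)))
    return [to_code[v] for v in train_values], [to_code[v] for v in test_values]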
def run_stack(SEED):

    model = ""

    print "Running GB, RF, ET stack."

    trainBase = csv_io.read_data("../train.csv", skipFirstLine=True, split=",")
    test = csv_io.read_data("../test.csv", skipFirstLine=True, split=",")

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant
                  # (note, predictions are less reliable when using 10).

    predicted_list = []
    bootstrapLists = []

    # Notes on linear / online models for large sparse inputs
    # (http://stackoverflow.com/questions/15150339/python-memory-error-sklearn-huge-input-data):
    # for high dimensional sparse data and many samples, LinearSVC, LogisticRegression,
    # PassiveAggressiveClassifier or SGDClassifier can be much faster to train for
    # comparable predictive accuracy.
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
    # LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
    # PassiveAggressiveClassifier(C=1.0, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False)
    # SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)

    clfs = [RandomForestClassifier(n_estimators=500, n_jobs=1, criterion='gini')]
    # best SVC(C=1000000.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1),
    # best LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1000.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None),
    #SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001, verbose=False)
    print "Data size: ", len(trainBase), len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [int(x[0]) for x in trainBase]
    trainPre = [[int(i) for i in x[1:]] for x in trainBase]
    trainPreTemp = [[int(i) for i in x[1:]] for x in trainBase]
    testPre = [[int(i) for i in x[1:]] for x in test]

    print "unique: ", len(list(set([x[1] for x in trainBase])))

    # One-hot encoding experiment (commented out): fit OneHotEncoder on train+test,
    # inspect n_values_ and feature_indices_, then transform trainPre and testPre
    # to sparse matrices.
    #enc = OneHotEncoder()
    #trainPreTemp.extend(testPre)
    #enc.fit(trainPreTemp)
    #trainPre = enc.transform(trainPre)
    #testPre = enc.transform(testPre)

    # Cluster the raw features; distances to the 10 centroids are used as the
    # feature representation below.
    km = KMeans(n_clusters=10, init='k-means++', n_init=100, max_iter=300, tol=0.0001,
                precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1).fit(trainPre)

    # Scaling experiment (commented out):
    #scaler = preprocessing.Scaler().fit(trainPre)
    #trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    print "Begin Training"

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []
        dataset_blend_test_set = np.zeros((len(test), NumFolds))

        foldCount = 0

        #Folds = cross_validation.StratifiedKFold(targetPre, n_folds=NumFolds, indices=True)
        Folds = cross_validation.KFold(len(trainBase), n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:

            target = [targetPre[i] for i in train_index]
            train = [trainPre[i] for i in train_index]

            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainPre[i] for i in test_index]

            print
            print "Iteration: ", foldCount
            #print "LEN: ", len(train), len(target)

            train = km.transform(train)
            trainTest = km.transform(trainTest)

            clf.fit(train, target)

            print "Predict"
            prob = clf.predict_proba(trainTest)

            print "Score"
            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            fpr, tpr, thresholds = metrics.roc_curve(targetTest, prob[:, 1], pos_label=1)
            auc = metrics.auc(fpr, tpr)
            print "Score: ", auc
            avg += auc / NumFolds

            # The classifier was trained on KMeans cluster distances, so the test
            # set is transformed the same way (the original predicted on the raw
            # testPre features here, which does not match the training input).
            predicted_probs = clf.predict_proba(km.transform(testPre))
            dataset_blend_test_set[:, foldCount] = predicted_probs[:, 1]

            foldCount = foldCount + 1
            break  # only the first fold is run

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1))
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                                      dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                                           dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""],
                                    filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test
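# For reference, KMeans.transform maps each sample to its distances from the
# fitted cluster centers, so the classifier above sees a 10-dimensional distance
# representation instead of the raw columns. A small sketch of that feature
# construction (a hypothetical helper, not part of the original script):
def kmeans_distance_features(train_rows, test_rows, n_clusters=10):
    km_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10).fit(train_rows)
    return km_model.transform(train_rows), km_model.transform(test_rows)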
def run_stack(SEED):

    model = ""
    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    targetX = csv_io.read_data("target.csv", skipFirstLine=False, split=",")
    trainBase = csv_io.read_data("train1.csv", skipFirstLine=False, split=",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    trainBase = trainBase[0:5000]
    targetX = targetX[0:5000]

    train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine=False, split=",")
    test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine=False, split=",")

    predicted_list = []
    bootstrapLists = []

    clfs = [
        GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    ]
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    print "Data size: ", len(trainBase), 11573  # len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros((11573, len(clfs)))  # np.zeros takes the shape as a tuple.

    #targetPre = target #[0:5000]
    #testScaled = test
    #trainScaled = trainBase #[0:5000]

    print "Begin Training"

    lenTrainBase = len(trainBase)
    #lenTrainBase = len(trainBase[0:5000])

    lenTest = 11573
    #lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        # Stratified for classification... [trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:

            target = [targetX[i] for i in train_index]
            train = [trainBase[i] for i in train_index]

            targetTest = [targetX[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]

            #target = [targetPre[i] for i in train_index]
            #train = [trainScaled[i] for i in train_index]
            #targetTest = [targetPre[i] for i in test_index]
            #trainTest = [trainScaled[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(target)

            #print train[0]
            #print target[0]
            #return

            print "Start", datetime.datetime.now()
            clf.fit(train, target)
            prob = clf.predict(trainTest)
            print "End ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]
                #probX = 31100.0  # debug override; leaving this enabled would clobber every prediction.
                if probX < 0:  # guard log10 against negative predictions, as in the bulldozer variant below.
                    probX = -probX
                print targetTest[i][0], probX

                probSum += math.pow(math.log10(targetTest[i][0]) - math.log10(probX), 2)
                #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                #weightSum += weights[test_index[i]][0]
                #print "Weight", weights[test_index[i]][0], "Index: ", i, "Test_Index: ", test_index[i], "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                #     totalOffByHalf = totalOffByHalf + 1
                # if ( int(targetTest[i]) == 1 ):
                #     totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                #     totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                #     totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions

            #print -probSum/len(prob)
            print "Score: ", math.sqrt(probSum / len(prob))
            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            # Predict the test file row by row to keep memory down.
            fo = open("test1.csv", "r")
            predicted_probs = []
            for line in fo:
                line = line.strip().split(",")
                newRow = []
                for item in line:
                    newRow.append(float(item))
                predicted_probs.append(clf.predict(newRow)[0])  # predict() returns an array; keep the scalar.
            fo.close()

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]

            gc.collect()
            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1))
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test
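# The fold loop above scores each fold with a root mean squared log error (RMSLE)
# style metric: sqrt(mean((log10(actual) - log10(predicted))^2)). A small helper
# version of that calculation, written as a sketch; the guard against non-positive
# predictions is an assumption that mirrors the sign flip used in the bulldozer
# variant further down.
import math

def rmsle10(actuals, predictions):
    total = 0.0
    for actual, predicted in zip(actuals, predictions):
        if predicted <= 0:
            predicted = abs(predicted) or 1e-6  # keep log10 defined
        total += math.pow(math.log10(actual) - math.log10(predicted), 2)
    return math.sqrt(total / len(actuals))

# Example: rmsle10([31100.0, 28000.0], [30000.0, 29500.0])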
def run_stack(SEED):

    model = "base"

    trainBase = csv_io_np.read_data("PreProcessData/train.csv", skipFirstLine=True, split=",")
    test = csv_io_np.read_data("PreProcessData/test.csv", skipFirstLine=True, split=",")

    print "Data Read Complete"

    avg = 0
    NumFolds = 5

    predicted_list = []
    bootstrapLists = []

    # 100 produced 94%
    # 1000 did not finish in about 5+ hours...
    # 300 about 5 hours, .9691 on first CF
    # learn_rate=0.01, n_estimators=300, subsample=1.0, min_samples_split=30, 0.9386
    # GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=300, subsample=1.0, min_samples_split=30, min_samples_leaf=1, max_depth=5, init=None, random_state=None, max_features=None)

    # Leaderboard score of 98443, for 20th place.
    #SVC(C=10**6, kernel='rbf', degree=3, gamma=10**-6.35, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)

    clfs = [
        SVC(C=10**6, kernel='rbf', degree=3, gamma=10**-6.35, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)
    ]

    print "Data size: ", len(trainBase), len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]

    # image data gave the best results scaled to [-1, 1]
    #preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)

    #print trainPre[0]
    #scaler = preprocessing.Scaler().fit(trainPre)
    #trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)
    trainScaled = trainPre
    testScaled = testPre

    #print scaler.mean_
    #print scaler.std_

    print "Begin Training"

    lenTrainBase = len(trainBase)
    trainBase = []

    lenTest = len(test)
    test = []

    trainPre = []
    testPre = []

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        # Stratified for classification... [trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:

            #trainBaseTemp = [trainBase[i] for i in train_index]
            #target = [x[0] for x in trainBaseTemp]
            #train = [x[1:] for x in trainBaseTemp]

            #testBaseTemp = [trainBase[i] for i in test_index]
            #targetTest = [x[0] for x in testBaseTemp]
            #trainTest = [x[1:] for x in testBaseTemp]

            #test = [x[0:] for x in test]

            target = [targetPre[i] for i in train_index]
            train = [trainScaled[i] for i in train_index]

            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainScaled[i] for i in test_index]

            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(train[0]), len(target), len(trainTest), len(trainTest[0])

            print datetime.datetime.now()
            clf.fit(train, target)
            print datetime.datetime.now()
            prob = clf.predict(trainTest)

            dataset_blend_train[test_index, ExecutionIndex] = prob

            # Simple accuracy on the held-out fold.
            probSum = 0.0
            count = 0.0
            for i in range(0, len(prob)):
                probX = prob[i]  #[1]
                #print probX, targetTest[i]
                if (targetTest[i] == probX):
                    probSum += 1.0
                count = count + 1.0

            print "Sum: ", probSum, count
            print "Score: ", probSum / count

            avg += (probSum / count) / NumFolds

            #predicted_probs = clf.predict(testScaled)
            ######predicted_list.append([x[1] for x in predicted_probs])
            #dataset_blend_test_set[:, foldCount] = predicted_probs #[0]

            foldCount = foldCount + 1

        print "Final Train", datetime.datetime.now()
        clf.fit(trainScaled, targetPre)  # must do this full refit for multiclass classification...
        print "Final Predict", datetime.datetime.now()
        predicted_probs = clf.predict(testScaled)
        print "Writing Data", datetime.datetime.now()

        #dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", predicted_probs)  # for multiclass
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
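# The scaling step above is currently a pass-through (trainScaled = trainPre).
# If the [-1, 1] scaling mentioned in the comments is wanted, this is a sketch of
# how it could be wired in with sklearn's preprocessing module; the feature_range
# choice is an assumption, based on the note that image data behaved best in [-1, 1].
from sklearn import preprocessing

def scale_features(trainPre, testPre):
    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    trainScaled = scaler.fit_transform(trainPre)
    testScaled = scaler.transform(testPre)  # reuse the training-set statistics
    return trainScaled, testScaled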
def Blend():

    lossThreshold = 4.0  # best seems to be about 4.0
    model = "Long-Lat KNN5"  # used only for target values.

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine=False, split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    target = [x[0] for x in trainBase]

    # Pick up every level-one prediction file whose CV score beats the threshold.
    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if (filename[0:5] == "Stack" and float(parts[2]) < lossThreshold):
            stackFiles.append(filename)

    dataset_blend_train = np.zeros((len(trainBase), len(stackFiles)))
    dataset_blend_test = np.zeros((len(test), len(stackFiles)))

    print "Loading Data"
    for fileNum, file in enumerate(stackFiles):
        print file
        trn = csv_io.read_data("../predictions/Target_" + file, split=",", skipFirstLine=False)
        for row, datum in enumerate(trn):
            dataset_blend_train[row, fileNum] = datum[0]

        tst = csv_io.read_data("../predictions/" + file, split=",", skipFirstLine=False)
        for row, datum in enumerate(tst):
            dataset_blend_test[row, fileNum] = datum[0]

    np.savetxt('temp/dataset_blend_trainX.txt', dataset_blend_train)
    np.savetxt('temp/dataset_blend_testX.txt', dataset_blend_test)

    print "Num files processed: ", len(stackFiles), "Threshold: ", lossThreshold

    # linear 3.15 -> 3.42
    # RF 1.2 -> 3.5
    # GB (125) 3.15

    print "Starting Blend"

    # GB 400 is 3.11
    # GB 400 max_depth=14 is 2.82; greater depth is better.
    # GB seems to overfit: scores drop to 3.33 for 100 estimators, while linear on the same code gives 3.27.
    # Might try smaller numbers in GB than 20 depth and 100 estimators to prevent overfitting.
    # clfs = [
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=20, n_estimators=400, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=30, n_estimators=400, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=40, n_estimators=400, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=80, n_estimators=400, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=20, n_estimators=800, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=30, n_estimators=800, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=40, n_estimators=800, random_state=551),
    #     GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=80, n_estimators=800, random_state=551)
    # ]

    clfs = [
        Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001)
    ]  # this returned 2.95 when linear returned 3.06; need to check for overfitting.

    #KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)
    # linear 3.06, lasso is 3.06
    #Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute='auto', copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False)
    # linear 3.06, ridge 3.05
    #Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001)
    # linear 3.06, SVC 2.77, not sure if overfitting, need to submit to test **************
    #SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)

    # clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]
    #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)

    # use for classification probabilities
    # clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    #         LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    #         LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
    #         LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    #         LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    #         LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]

    dataset_blend_test_set = np.zeros((len(test), len(clfs)))

    avgScore = 0.0

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        clf.fit(dataset_blend_train, target)
        submission = clf.predict(dataset_blend_test)

        now = datetime.datetime.now()

        # attempt to score the training set to predict score for blend...
        probSum = 0.0
        weightSum = 0

        trainPrediction = clf.predict(dataset_blend_train)
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]

            probSum += weights[i][0] * math.fabs(target[i] - probX)
            weightSum += weights[i][0]
            #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)

        print "Train Score: ", (probSum / weightSum)
        avgScore += (probSum / weightSum)

        csv_io.write_delimited_file("../blend/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + "_" + str(probSum / weightSum) + "_" + str(clf)[:12] + ".csv", ["%f" % x for x in submission])
        csv_io.write_delimited_file("../blend/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avgScore / len(clfs)), str(clf), "1", model, "", "", ", ".join(stackFiles)], filemode="a", delimiter=",")

        dataset_blend_test_set[:, ExecutionIndex] = submission  # keep the float predictions for the final mean.

    print "Final Score: ", str(avgScore / len(clfs))

    csv_io.write_delimited_file_single("../blend/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avgScore / len(clfs)) + ".csv", dataset_blend_test_set.mean(1))
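# Blend() above relies on a filename convention for its inputs:
# Stack_<timestamp>_<cv-score>_<classifier-prefix>.csv, and it only keeps files
# whose cross-validation score beats lossThreshold. A small standalone sketch of
# that selection step (same directory the code assumes), useful for checking which
# level-one models would be blended before running the whole thing:
import os

def select_stack_files(directory="../predictions", lossThreshold=4.0):
    stackFiles = []
    for filename in os.listdir(directory):
        parts = filename.split("_")
        if filename.startswith("Stack") and len(parts) > 2 and float(parts[2]) < lossThreshold:
            stackFiles.append(filename)
    return stackFiles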
            predicted_probs = clf.predict_proba(finalTestSparse)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs[:, 1]

            foldCount = foldCount + 1
            #break

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a", delimiter=",")

        print "------------------------Average: ", avg
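# csv_io is the project's own helper module and its implementation is not shown in
# this file. Purely as a hypothetical illustration of the behaviour the calls above
# rely on (one prediction value written per output line), a stand-in writer could
# look like the sketch below; the real helper may differ.
def write_delimited_file_single_sketch(path, values):
    f = open(path, "w")
    for value in values:
        f.write(str(value) + "\n")
    f.close()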
def run_stack(SEED):

    train, test = util.get_train_test_df()

    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])

    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            # Map each string level to an integer code; unseen levels fall back to -1.
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
            train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
            test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    #targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
    #trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    #trainBase = trainBase[0:5000]
    #targetX = targetX[0:5000]

    #train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
    #test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")

    predicted_list = []
    bootstrapLists = []

    clfs = [
        GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1)
    ]
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    #train_fea, train["SalePrice"]
    print "Data size: ", len(train_fea), len(test_fea)

    #dataset_blend_train = np.zeros((len(train_fea), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros((len(test_fea), len(clfs)))
    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))

    print "Begin Training"

    lenTrainBase = 401125  # len(train_fea)
    lenTest = 11573  # len(test_fea)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        # Stratified for classification... [trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:

            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]

            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)

            #print trainX[0]
            #print target[0]
            #return

            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]
                #print targetTest[i], probX
                if probX < 0:  # some predictions are coming out negative.
                    probX = -probX

                probSum += math.pow(math.log10(targetTest[i]) - math.log10(probX), 2)
                #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                #weightSum += weights[test_index[i]][0]
                #print "Weight", weights[test_index[i]][0], "Index: ", i, "Test_Index: ", test_index[i], "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                #     totalOffByHalf = totalOffByHalf + 1
                # if ( int(targetTest[i]) == 1 ):
                #     totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                #     totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                #     totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions

            #print -probSum/len(prob)
            print "Score: ", math.sqrt(probSum / len(prob))
            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            # Predict the test set one row at a time to keep memory down.
            predicted_probs = []
            for i in range(0, lenTest):
                predicted_probs.append(clf.predict(test_fea.ix[i])[0])  # predict() returns an array; keep the scalar.

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]

            gc.collect()
            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1))
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test
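# get_date_dataframe() is referenced above but defined elsewhere in the project.
# As an assumption about its shape (not the project's actual code), a helper like
# it would typically expand the saledate column into simple numeric calendar
# features that tree ensembles can split on:
import pandas as pd

def get_date_dataframe_sketch(date_column):
    # date_column is assumed to be a pandas Series of datetime values.
    return pd.DataFrame({
        "SaleYear": [d.year for d in date_column],
        "SaleMonth": [d.month for d in date_column],
        "SaleDay": [d.day for d in date_column],
    }, index=date_column.index)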