Example #1
File: blend.py Project: mb16/Kaggle
import datetime
import math

import numpy as np
from sklearn.linear_model import LinearRegression

import csv_io  # project-local CSV helper used throughout these scripts
import stack   # project-local module providing run_stack()


def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)
		
	SEED = 448
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)
	clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]
	
	
	# clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]
	
	test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
	dataset_blend_test_set = np.zeros((len(test), len(clfs)))
	
	for ExecutionIndex, clf in enumerate(clfs):

		clf.fit(dataset_blend_train, target)
		submission = clf.predict(dataset_blend_test)

		now = datetime.datetime.now()
		# write the per-classifier submission; keep submission numeric for the blend matrix below
		csv_io.write_delimited_file("../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv", ["%f" % x for x in submission])
		

		
		# attempt to score the training set to predict score for blend...
		probSum = 0.0
		weightSum = 0
		
		trainPrediction = clf.predict(dataset_blend_train)
		for i in range(0, len(trainPrediction)):
			probX = trainPrediction[i]
			

			probSum += weights[i][0] * math.fabs(target[i] - probX)
			weightSum += weights[i][0]
			#probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
			 
		print "Train Score: ", (probSum/weightSum)
	
		dataset_blend_test_set[:, ExecutionIndex] = submission
	
	
	
	csv_io.write_delimited_file_single("../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv", dataset_blend_test_set.mean(1))	
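Note: the "Train Score" loop in Blend() computes a weighted mean absolute error. A minimal standalone sketch of the same metric, assuming target, predictions, and weights have already been flattened to plain float lists (csv_io returns rows, so the weights column would need to be pulled out first):

import numpy as np

def weighted_mae(target, predictions, weights):
    # Weighted mean absolute error, equivalent to the probSum/weightSum loop above.
    t = np.asarray(target, dtype=float)
    p = np.asarray(predictions, dtype=float)
    w = np.asarray(weights, dtype=float)
    return np.sum(w * np.abs(t - p)) / np.sum(w)

# Hypothetical values: weighted_mae([1.0, 2.0], [1.5, 1.0], [1.0, 2.0]) -> 2.5 / 3 = 0.8333...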
Example #2
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv",
                                 skipFirstLine=False,
                                 split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    SEED = 448
    #random.seed(SEED)
    #random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)
    clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]

    # clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]

    test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    dataset_blend_test_set = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):

        clf.fit(dataset_blend_train, target)
        submission = clf.predict(dataset_blend_test)

        now = datetime.datetime.now()
        # write the per-classifier submission; keep submission numeric for the blend matrix below
        csv_io.write_delimited_file(
            "../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") +
            ".csv", ["%f" % x for x in submission])

        # attempt to score the training set to predict score for blend...
        probSum = 0.0
        weightSum = 0

        trainPrediction = clf.predict(dataset_blend_train)
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]

            probSum += weights[i][0] * math.fabs(target[i] - probX)
            weightSum += weights[i][0]
            #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)

        print "Train Score: ", (probSum / weightSum)

        dataset_blend_test_set[:, ExecutionIndex] = submission

    csv_io.write_delimited_file_single(
        "../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        dataset_blend_test_set.mean(1))
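Note: the FinalBlend file above is just the row-wise mean over one prediction column per classifier. A tiny self-contained illustration of that averaging step, with made-up predictions:

import numpy as np

# Hypothetical per-classifier test predictions, one column per model.
preds_clf1 = [0.2, 0.4, 0.6]
preds_clf2 = [0.3, 0.5, 0.5]

dataset_blend_test_set = np.column_stack([preds_clf1, preds_clf2])
final_blend = dataset_blend_test_set.mean(1)  # row-wise mean, same as .mean(axis=1)
print final_blend  # -> [ 0.25  0.45  0.55]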
Example #3
def run_stack(SEED):

	model = "Long-Lat KNN5 - 50 Features"

	print "Running GB, RF, ET stack."

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)

	
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. (note, predictions are less reliable when using 10).


	predicted_list = []
	bootstrapLists = []

	# use this for quick runs.
	# note RF with 150 crashes on 30 features
	# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	# GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	# RandomForestRegressor(n_estimators=100, n_jobs=1),
	#RandomForestRegressor(n_estimators=75, n_jobs=1),
	# clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
		# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
		# ]	
	#knn 5 at 3.45
	#knn 15 at 3.31
	#knn 25 at 3.30
	#knn 40 at 3.31
	# KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# LinearRegression at 3.77
	# Ridge at 3.77
	# SGD 4.23
	#Gauss at 13
	# LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
	# Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
	# SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
	# GaussianNB()
	# clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
		 # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)	
		# ]
		
	# GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
	# clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
			# ]	
			
	# about 1 hour run time, and 3.10 score.		
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.05
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
	# about 4 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)	
	
	clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166)]
	
	
		# use this for quick runs.
	# clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]	
	
	
	
	# use this for quick runs.  reduced estimators to 50
	# clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
        # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
        # tol=0.001, verbose=False)
			# ]	
			
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]
			
			
	# full algorithm stack.
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	targetPre = [x[0] for x in trainBase]
	trainPre = [x[1:] for x in trainBase]
	testPre = [x[0:] for x in test]
	#print trainPre[0]
	scaler = preprocessing.Scaler().fit(trainPre)
	trainScaled = scaler.transform(trainPre)
	testScaled = scaler.transform(testPre)	

	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	
	for ExecutionIndex, clf in enumerate(clfs):
		print str(clf)
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((len(test), NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainScaled[i] for i in train_index]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainScaled[i] for i in test_index]	
	
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(target)
			
			clf.fit(train, target)
			prob = clf.predict(trainTest) 
			
			dataset_blend_train[test_index, ExecutionIndex] = prob



	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]

				probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				weightSum += weights[test_index[i]][0] 
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", probSum/weightSum
 
			avg += 	(probSum/weightSum)/NumFolds

			predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
		
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions_40/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
		print now
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
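Note: run_stack() above follows the usual out-of-fold stacking pattern: each fold's held-out predictions fill dataset_blend_train, and the per-fold test predictions are averaged into dataset_blend_test. A compact sketch of that skeleton, using hypothetical numpy arrays and the current sklearn KFold API rather than the old cross_validation module:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

def out_of_fold_column(clf, X, y, X_test, num_folds=5):
    # One stacking column: out-of-fold train predictions plus fold-averaged test predictions.
    oof_train = np.zeros(len(X))
    test_folds = np.zeros((len(X_test), num_folds))
    for fold, (tr_idx, te_idx) in enumerate(KFold(n_splits=num_folds).split(X)):
        clf.fit(X[tr_idx], y[tr_idx])
        oof_train[te_idx] = clf.predict(X[te_idx])
        test_folds[:, fold] = clf.predict(X_test)
    return oof_train, test_folds.mean(1)

# Hypothetical usage:
# X, y, X_test = np.random.rand(100, 5), np.random.rand(100), np.random.rand(20, 5)
# blend_train_col, blend_test_col = out_of_fold_column(Ridge(), X, y, X_test)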
Example #4
		print "Score: ", auc
			

		avg += 	auc/NumFolds

		predicted_probs = clf.predict_proba(finalTestSparse) 	
		#predicted_list.append([x[1] for x in predicted_probs])	
		dataset_blend_test_set[:, foldCount] = predicted_probs[:,1]
		
				
		foldCount = foldCount + 1
		
		#break	
		
	dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		

		
	now = datetime.datetime.now()

	csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
	csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
	csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
	print "------------------------Average: ", avg




Example #5
def run_stack(SEED):

	model = "" 

	print "Running Stack."

	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. 

	targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
	
	trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")	
	#test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

	trainBase = trainBase[0:5000]
	targetX = targetX[0:5000]
	
	train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
	test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")
	

	predicted_list = []
	bootstrapLists = []


	clfs = [
		GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)		
	]		
	#GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)	
	#GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),	
	#GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),	
	

	
	print "Data size: ", len(trainBase) , 11573 # len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	#dataset_blend_test = np.zeros((len(test), len(clfs)))
	dataset_blend_test = np.zeros((11573, len(clfs)))  # np.zeros needs the shape as a tuple
	
	#targetPre = target #[0:5000]
	#testScaled = test
	#trainScaled = trainBase #[0:5000]

	#targetPre = target #[0:5000]
	#testScaled = test
	#trainScaled = trainBase #[0:5000]
	
	
	print "Begin Training"

	lenTrainBase = len(trainBase)
	#lenTrainBase = len(trainBase[0:5000])


	lenTest = 11573
	#lenTest = len(test)

	
	
	gc.collect()
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((lenTest, NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
			
		
		
		for train_index, test_index in Folds:

			target = [targetX[i] for i in train_index]
			train = [trainBase[i] for i in train_index]
			
			targetTest = [targetX[i] for i in test_index]	
			trainTest = [trainBase[i] for i in test_index]
			
			#target = [targetPre[i] for i in train_index]
			#train = [trainScaled[i] for i in train_index]
			
			#targetTest = [targetPre[i] for i in test_index]	
			#trainTest = [trainScaled[i] for i in test_index]	
	
			gc.collect()
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(target)
			
			#print train[0]
			#print target[0]
			#return
			
			print "Start", datetime.datetime.now()
			clf.fit(train, target)
			prob = clf.predict(trainTest) 
			print "End  ", datetime.datetime.now()
			
			dataset_blend_train[test_index, ExecutionIndex] = prob

			gc.collect()

	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]
				probX = 31100.0  # NOTE: overrides the prediction with a constant baseline, so the printed score is for a constant guess
				print targetTest[i][0], probX
				probSum += math.pow(math.log10(targetTest[i][0]) - math.log10(probX), 2)
				
				#probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				#weightSum += weights[test_index[i]][0] 
				
				
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", math.sqrt(probSum/len(prob))
 
			avg += 	math.sqrt(probSum/len(prob))/NumFolds

			gc.collect()
			
			fo = open("test1.csv", "r")			
			predicted_probs = []
			
			for line in fo:
				line = line.strip().split(",")
				newRow = []		
				for item in line:
					newRow.append(float(item))
					
				predicted_probs.append(clf.predict(newRow)[0])  # take the scalar so the fold column below gets a flat list
				
			fo.close()
			
			#predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
			gc.collect()
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a",delimiter=",")
		
		
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
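Note: the score in this version is a root-mean-squared error of log10 prices (probSum accumulates squared log10 differences). A standalone sketch of that metric with hypothetical values:

import math

def rms_log10_error(actual, predicted):
    # sqrt(mean((log10(actual) - log10(predicted))**2)), as in the fold loop above.
    total = 0.0
    for a, p in zip(actual, predicted):
        total += math.pow(math.log10(a) - math.log10(p), 2)
    return math.sqrt(total / len(actual))

# Example: rms_log10_error([100.0, 1000.0], [100.0, 100.0]) -> sqrt(0.5) ~= 0.7071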
Example #6
def run_stack(SEED):

    train, test = util.get_train_test_df()

    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])

    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1,
                                             [(x[1], x[0])
                                              for x in enumerate(s)])
            train_fea = train_fea.join(
                pd.DataFrame(
                    {col: [str_to_categorical[x] for x in train[col]]},
                    index=train.index))
            test_fea = test_fea.join(
                pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]},
                             index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    #targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
    #trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    #trainBase = trainBase[0:5000]
    #targetX = targetX[0:5000]

    #train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
    #test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")

    predicted_list = []
    bootstrapLists = []

    clfs = [
        GradientBoostingRegressor(loss='lad',
                                  learn_rate=0.05,
                                  subsample=0.5,
                                  max_depth=6,
                                  n_estimators=3000,
                                  random_state=166,
                                  min_samples_leaf=1)
    ]
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    #train_fea, train["SalePrice"]
    print "Data size: ", len(train_fea), len(test_fea)
    #dataset_blend_train = np.zeros((len(train_fea), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros(
        (len(test_fea), len(clfs)))  # np.zeros(len(train_fea), len(clfs))
    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))

    print "Begin Training"

    lenTrainBase = 401125  # len(train_fea)

    lenTest = 11573  # len(test_fea)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for train_index, test_index in Folds:

            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]

            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)

            #print trainX[0]
            #print target[0]
            #return

            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End  ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]
                #print targetTest[i], probX

                if probX < 0:  # some are coming out negative.
                    probX = -probX

                probSum += math.pow(
                    math.log10(targetTest[i]) - math.log10(probX), 2)

                #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                #weightSum += weights[test_index[i]][0]

                #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                # totalOffByHalf = totalOffByHalf + 1

                # if ( int(targetTest[i]) == 1 ):
                # totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                # totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                # totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions
            #print -probSum/len(prob)
            print "Score: ", math.sqrt(probSum / len(prob))

            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            predicted_probs = []

            for i in range(0, lenTest):
                predicted_probs.append(clf.predict(test_fea.ix[i])[0])  # take the scalar so the fold column below gets a flat list

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]
            gc.collect()

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
            str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_test_set.mean(1))

        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
            "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])

        csv_io.write_delimited_file("../predictions/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", model, "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print "------------------------Average: ", avg

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
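Note: Example #6 maps string columns to integer codes with a defaultdict built from the training data, so categories unseen in training map to -1. That mapping in isolation, with hypothetical column values:

from collections import defaultdict

train_col = ["High", "Low", "Medium", "Low"]   # hypothetical training column
test_col = ["Low", "Unknown"]                  # hypothetical test column

# Category -> integer code, built from training values only; unseen values map to -1.
categories = set(train_col)
str_to_categorical = defaultdict(lambda: -1,
                                 [(x[1], x[0]) for x in enumerate(categories)])

train_codes = [str_to_categorical[x] for x in train_col]
test_codes = [str_to_categorical[x] for x in test_col]   # "Unknown" -> -1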
Example #7
def run_stack(SEED):

	model = ""

	print "Running GB, RF, ET stack."

	trainBase = csv_io.read_data("../train.csv", skipFirstLine = True, split = ",")
	test = csv_io.read_data("../test.csv", skipFirstLine = True, split = ",")


	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. (note, predictions are less reliable when using 10).


	predicted_list = []
	bootstrapLists = []

	# use this for quick runs.
	# note RF with 150 crashes on 30 features
	# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	# GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	# RandomForestRegressor(n_estimators=100, n_jobs=1),
	#RandomForestRegressor(n_estimators=75, n_jobs=1),
	# clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
		# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
		# ]	
	#knn 5 at 3.45
	#knn 15 at 3.31
	#knn 25 at 3.30
	#knn 40 at 3.31
	# KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# LinearRegression at 3.77
	# Ridge at 3.77
	# SGD 4.23
	#Gauss at 13
	# LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
	# Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
	# SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
	# GaussianNB()
	# clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
		 # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)	
		# ]
		
	# GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
	# clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
			# ]	
			
	# about 1 hour run time, and 3.10 score.		
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.05
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
	# about 4 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)	
	#SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
	
	# http://stackoverflow.com/questions/15150339/python-memory-error-sklearn-huge-input-data
	#For high dimensional sparse data and many samples, LinearSVC, LogisticRegression, 
	# PassiveAggressiveClassifier or SGDClassifier can be much faster to train for comparable predictive accuracy.
	
	# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
	# LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
	# PassiveAggressiveClassifier(C=1.0, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False)
	# SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
	
	clfs = [RandomForestClassifier(n_estimators=500, n_jobs=1, criterion='gini')]

	# best SVC(C=1000000.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1),
	# best LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1000.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None),

	
	#SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,gamma=0.0, kernel='rbf', max_iter=-1, probability=False, shrinking=True,tol=0.001, verbose=False)
		# use this for quick runs.
	# clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]	
	
	
	
	# use this for quick runs.  reduced estimators to 50
	# clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
        # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
        # tol=0.001, verbose=False)
			# ]	
			
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]
			
			
	# full algorithm stack.
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	#targetPre = [x[0] for x in trainBase]
	#trainPre = [x[1:] for x in trainBase]
	#trainPreTemp = [x[1:] for x in trainBase]
	#testPre = [x[1:] for x in test]

	targetPre = [int(x[0]) for x in trainBase]
	trainPre = [[int(i) for i in x[1:]] for x in trainBase]
	trainPreTemp = [[int(i) for i in x[1:]] for x in trainBase]
	testPre = [[int(i) for i in x[1:]] for x in test]
	
	print "unique: ", len(list(set([x[1] for x in trainBase])))
	
	#enc = OneHotEncoder()
	#print len(trainPreTemp)
	#trainPreTemp.extend(testPre)
	#print len(trainPreTemp)
	#enc.fit(trainPreTemp)
	#print enc.n_values_
	#print enc.feature_indices_
	
	#out = enc.transform(trainPre)
	#trainPre = out#.toarray()
	#print out.shape # len(out), len(out[0])
	
	#out = enc.transform(testPre)
	#testPre = out#.toarray()
	#print out.shape
	
	km = KMeans(n_clusters=10, init='k-means++', n_init=100, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1).fit(trainPre)
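	# km.transform() maps each row to its Euclidean distance from each of the 10 fitted
	# cluster centers, so the classifiers below train on a 10-dimensional cluster-distance
	# representation of the data rather than on the raw integer features.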
	
	
	
	#return
	
	
	#print trainPre[0]
	#scaler = preprocessing.Scaler().fit(trainPre)
	#trainScaled = scaler.transform(trainPre)
	#testScaled = scaler.transform(testPre)	

	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((len(test), NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		#Folds = cross_validation.StratifiedKFold(targetPre, n_folds=NumFolds, indices=True)
		Folds = cross_validation.KFold(len(trainBase), n_folds=NumFolds, indices=True)
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainPre[i] for i in train_index]
			
			#train = trainPre.tocsr()[train_index,:]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainPre[i] for i in test_index]	
			
			#trainTest = trainPre.tocsr()[test_index,:]
	
	
			print
			print "Iteration: ", foldCount
			#print "LEN: ", len(train), len(target)
		
			train = km.transform(train)
			trainTest = km.transform(trainTest)
		
			clf.fit(train, target)
			print "Predict"
			prob = clf.predict_proba(trainTest) 
			print "Score"
			dataset_blend_train[test_index, ExecutionIndex] = prob[:,1]



	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			fpr, tpr, thresholds = metrics.roc_curve(targetTest, prob[:,1], pos_label=1)
			auc = metrics.auc(fpr,tpr)
			print "Score: ", auc
			
			#for i in range(0, len(prob)):
				#print prob
				#probX = prob[i]

				#probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				#weightSum += weights[test_index[i]][0] 
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			#print "Score: ", probSum/weightSum
 
			avg += 	auc/NumFolds

			predicted_probs = clf.predict_proba(testPre) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs[:,1] #[0]
		
				
			foldCount = foldCount + 1
		
			break  # stop after the first fold; the remaining columns of dataset_blend_test_set stay zero, which scales down the mean taken below
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
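A minimal, self-contained sketch of the out-of-fold stacking pattern the function above implements, written against the current scikit-learn API (sklearn.model_selection.KFold) rather than the older cross_validation module these examples use; the function and variable names below are illustrative only.

import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

def oof_stack(clfs, X, y, X_test, n_folds=5, seed=448):
    # X, y, X_test are NumPy arrays. One column per base model: out-of-fold
    # probabilities for the training rows, fold-averaged probabilities for the test rows.
    blend_train = np.zeros((len(X), len(clfs)))
    blend_test = np.zeros((len(X_test), len(clfs)))
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for m, clf in enumerate(clfs):
        per_fold_test = np.zeros((len(X_test), n_folds))
        for f, (tr_idx, va_idx) in enumerate(folds.split(X)):
            clf.fit(X[tr_idx], y[tr_idx])
            blend_train[va_idx, m] = clf.predict_proba(X[va_idx])[:, 1]
            per_fold_test[:, f] = clf.predict_proba(X_test)[:, 1]
        blend_test[:, m] = per_fold_test.mean(axis=1)
    return blend_train, blend_test

# Usage (illustrative): blend_train, blend_test = oof_stack([LogisticRegression()], X, y, X_test)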
Example #8
0
def run_stack(SEED):

    model = ""

    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    targetX = csv_io.read_data("target.csv", skipFirstLine=False, split=",")

    trainBase = csv_io.read_data("train1.csv", skipFirstLine=False, split=",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    trainBase = trainBase[0:5000]
    targetX = targetX[0:5000]

    train_saleID = csv_io.read_data("train_salesID.csv",
                                    skipFirstLine=False,
                                    split=",")
    test_salesID = csv_io.read_data("test_salesID.csv",
                                    skipFirstLine=False,
                                    split=",")

    predicted_list = []
    bootstrapLists = []

    clfs = [
        GradientBoostingRegressor(loss='ls',
                                  learn_rate=0.05,
                                  subsample=0.5,
                                  max_depth=6,
                                  n_estimators=300,
                                  random_state=166,
                                  min_samples_leaf=1)
    ]
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    print "Data size: ", len(trainBase), 11573  # len(test)
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros((11573, len(clfs)))  # shape must be passed as a tuple

    #targetPre = target #[0:5000]
    #testScaled = test
    #trainScaled = trainBase #[0:5000]

    #targetPre = target #[0:5000]
    #testScaled = test
    #trainScaled = trainBase #[0:5000]

    print "Begin Training"

    lenTrainBase = len(trainBase)
    #lenTrainBase = len(trainBase[0:5000])

    lenTest = 11573
    #lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for train_index, test_index in Folds:

            target = [targetX[i] for i in train_index]
            train = [trainBase[i] for i in train_index]

            targetTest = [targetX[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]

            #target = [targetPre[i] for i in train_index]
            #train = [trainScaled[i] for i in train_index]

            #targetTest = [targetPre[i] for i in test_index]
            #trainTest = [trainScaled[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(target)

            #print train[0]
            #print target[0]
            #return

            print "Start", datetime.datetime.now()
            clf.fit(train, target)
            prob = clf.predict(trainTest)
            print "End  ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]
                probX = 31100.0  # NOTE: overrides every prediction with a constant baseline; remove this line to score the model's own output
                print targetTest[i][0], probX
                probSum += math.pow(
                    math.log10(targetTest[i][0]) - math.log10(probX), 2)

                #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                #weightSum += weights[test_index[i]][0]

                #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                # totalOffByHalf = totalOffByHalf + 1

                # if ( int(targetTest[i]) == 1 ):
                # totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                # totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                # totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions
            #print -probSum/len(prob)
            print "Score: ", math.sqrt(probSum / len(prob))

            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            fo = open("test1.csv", "r")
            predicted_probs = []

            for line in fo:
                line = line.strip().split(",")
                newRow = []
                for item in line:
                    newRow.append(float(item))

                predicted_probs.append(clf.predict(newRow)[0])  # take the scalar so the column assignment below broadcasts

            fo.close()
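            # Streaming test1.csv row by row keeps the full test matrix out of memory,
            # at the cost of one predict() call per row per fold.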

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]
            gc.collect()

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
            str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_test_set.mean(1))

        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
            "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])

        csv_io.write_delimited_file("../predictions/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", model, "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print "------------------------Average: ", avg

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
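A hedged sketch of the fold score printed above: a root mean squared error over log10-transformed values (the usual RMSLE is defined with log1p, so treat this as the script's own variant rather than a library metric); the abs() guard mirrors how a later example handles the occasional negative regression output.

import math

def rmsle_log10(actual, predicted):
    total = 0.0
    for a, p in zip(actual, predicted):
        total += (math.log10(a) - math.log10(abs(p))) ** 2
    return math.sqrt(total / len(actual))

# rmsle_log10([31100.0, 28000.0], [30000.0, 27500.0])  -> roughly 0.012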
Example #9
0
def run_stack(SEED):

    model = "base"

    trainBase = csv_io_np.read_data("PreProcessData/train.csv",
                                    skipFirstLine=True,
                                    split=",")
    test = csv_io_np.read_data("PreProcessData/test.csv",
                               skipFirstLine=True,
                               split=",")

    print "Data Read Complete"

    avg = 0
    NumFolds = 5

    predicted_list = []
    bootstrapLists = []

    # 100 produced 94%
    # 1000 did not finish in about 5+ hours...
    # 300 about 5 hours, .9691 on first CF
    # learn_rate=0.01, n_estimators=300, subsample=1.0, min_samples_split=30, 0.9386
    #		GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=300, subsample=1.0, min_samples_split=30, min_samples_leaf=1, max_depth=5, init=None, random_state=None, max_features=None)

    # Leader board of 98443, for 20th place.
    #SVC(C=10**6, kernel='rbf', degree=3, gamma=10**-6.35, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)

    clfs = [
        SVC(C=10**6,
            kernel='rbf',
            degree=3,
            gamma=10**-6.35,
            coef0=0.0,
            shrinking=True,
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False)
    ]

    print "Data size: ", len(trainBase), len(test)
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]

    # image best results [-1, 1]
    #preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)

    #print trainPre[0]
    #scaler = preprocessing.Scaler().fit(trainPre)
    #trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)
    trainScaled = trainPre
    testScaled = testPre

    #print scaler.mean_
    #print scaler.std_
    print "Begin Training"

    lenTrainBase = len(trainBase)
    trainBase = []

    lenTest = len(test)
    test = []

    trainPre = []
    testPre = []

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for train_index, test_index in Folds:

            #trainBaseTemp = [trainBase[i] for i in train_index]
            #target = [x[0] for x in trainBaseTemp]
            #train = [x[1:] for x in trainBaseTemp]

            #testBaseTemp = [trainBase[i] for i in test_index]
            #targetTest = [x[0] for x in testBaseTemp]
            #trainTest = [x[1:] for x in testBaseTemp]

            #test = [x[0:] for x in test]

            target = [targetPre[i] for i in train_index]
            train = [trainScaled[i] for i in train_index]

            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainScaled[i] for i in test_index]

            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(
                train[0]), len(target), len(trainTest), len(trainTest[0])

            print datetime.datetime.now()
            clf.fit(train, target)
            print datetime.datetime.now()
            prob = clf.predict(trainTest)

            dataset_blend_train[test_index, ExecutionIndex] = prob

            probSum = 0.0
            count = 0.0

            for i in range(0, len(prob)):
                probX = prob[i]  #[1]
                #print probX, targetTest[i]
                if (targetTest[i] == probX):
                    probSum += 1.0
                count = count + 1.0

            print "Sum: ", probSum, count
            print "Score: ", probSum / count

            avg += (probSum / count) / NumFolds

            #predicted_probs = clf.predict(testScaled)
            ######predicted_list.append([x[1] for x in predicted_probs])
            #dataset_blend_test_set[:, foldCount] = predicted_probs #[0]

            foldCount = foldCount + 1

        print "Final Train", datetime.datetime.now()
        clf.fit(trainScaled,
                targetPre)  # must do this for multiclass classification...
        print "Final Predict", datetime.datetime.now()
        predicted_probs = clf.predict(testScaled)
        print "Writing Data", datetime.datetime.now()
        #dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()

        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
            str(avg) + "_" + str(clf)[:12] + ".csv",
            predicted_probs)  # for multiclass

        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
            "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])

        csv_io.write_delimited_file("../predictions/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", model, "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
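The function above uses K-fold cross-validation only to estimate accuracy, then refits the classifier on the full training set before predicting the test set (the comment in the code notes this is needed for the multiclass case). A minimal sketch of that pattern under the current KFold API, with illustrative names:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC

def cv_then_refit(clf, X, y, X_test, n_folds=5):
    # X, y, X_test are NumPy arrays.
    scores = []
    for tr_idx, va_idx in KFold(n_splits=n_folds).split(X):
        clf.fit(X[tr_idx], y[tr_idx])
        scores.append(float(np.mean(clf.predict(X[va_idx]) == y[va_idx])))  # fold accuracy
    clf.fit(X, y)  # final refit on every training row
    return np.mean(scores), clf.predict(X_test)

# Usage (illustrative): acc, preds = cv_then_refit(SVC(C=10**6, gamma=10**-6.35), X, y, X_test)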
Example #10
0
def Blend():

	lossThreshold = 4.0  # best seems to be about 4.0
	model = "Long-Lat KNN5"

	#used only for targets values.
	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)

	target = [x[0] for x in trainBase]
	
	
	
	
	stackFiles = []
	for filename in os.listdir("../predictions"):
		parts = filename.split("_")
		if ( filename[0:5] == "Stack" and float(parts[2]) < lossThreshold):

			stackFiles.append(filename)
	
	
	dataset_blend_train = np.zeros((len(trainBase), len(stackFiles)))
	dataset_blend_test = np.zeros((len(test), len(stackFiles)))
	
	print "Loading Data"
	for fileNum, file in enumerate(stackFiles):
		print file
		trn = csv_io.read_data("../predictions/Target_" + file, split="," ,skipFirstLine = False)
		for row, datum in enumerate(trn):
			dataset_blend_train[row, fileNum] = datum[0]
		
		tst = csv_io.read_data("../predictions/" + file, split="," ,skipFirstLine = False)
		for row, datum in enumerate(tst):
			dataset_blend_test[row, fileNum] = datum[0]

	np.savetxt('temp/dataset_blend_trainX.txt', dataset_blend_train)
	np.savetxt('temp/dataset_blend_testX.txt', dataset_blend_test)
	print "Num file processed: ", len(stackFiles), "Threshold: ", lossThreshold

	
	# linear 3.15 -> 3.42
	# RF 1.2 -> 3.5
	# GB (125) 3.15
	
	
	
	
	
	print "Starting Blend"
	#GB 400 is 3.11
	#GB 400 max_depth=14 is 2.82  greater depth is better.
	# GB seems to overfit: with 100 estimators the score is 3.33, while linear on the same code gives 3.27.
	# Might try smaller values than depth 20 and 100 estimators in GB to prevent overfitting.
	
	# clfs = [
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=20, n_estimators=400, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=30, n_estimators=400, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=40, n_estimators=400, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=80, n_estimators=400, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=20, n_estimators=800, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=30, n_estimators=800, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=40, n_estimators=800, random_state=551),
		# GradientBoostingRegressor(learn_rate=0.05, subsample=0.2, max_depth=80, n_estimators=800, random_state=551)
	
		# ]
		
	clfs = [Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001)
	]	
	
	# this returned 2.95 when linear returned 3.06,  need to check for overfitting.
	#KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)
	
	# linear 3.06, lasso is 3.06
	#Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute='auto', copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False)
	
	#linear 3.06, ridge 3.05
	#Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001)
	
	#linear 3.06, SVC 2.77, not sure if overfitting, need to submit to test**************
	#SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)
	
	
	# clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
	# ]	
		
	#LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
	# use for classification probablilities
	# clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]
	

	dataset_blend_test_set = np.zeros((len(test), len(clfs)))
	
	avgScore = 0.0
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		clf.fit(dataset_blend_train, target)
		submission = clf.predict(dataset_blend_test)
		
		submission = ["%f" % x for x in submission]
		now = datetime.datetime.now()

		

		
		# attempt to score the training set to predict score for blend...
		probSum = 0.0
		weightSum = 0
		
		trainPrediction = clf.predict(dataset_blend_train)
		for i in range(0, len(trainPrediction)):
			probX = trainPrediction[i]
			

			probSum += weights[i][0] * math.fabs(target[i] - probX)
			weightSum += weights[i][0]
			#probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
			 
		print "Train Score: ", (probSum/weightSum)
		avgScore += (probSum/weightSum)
	
		csv_io.write_delimited_file("../blend/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + "_" + str(probSum/weightSum)+ "_" + str(clf)[:12] + ".csv", submission)	
	
		csv_io.write_delimited_file("../blend/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avgScore/len(clfs)), str(clf), "1", model, "", "", ", ".join(stackFiles)], filemode="a",delimiter=",")
	
		dataset_blend_test_set[:, ExecutionIndex] = submission
	
	
	print "Final Score: ", str(avgScore/len(clfs))
	
	csv_io.write_delimited_file_single("../blend/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avgScore/len(clfs)) + ".csv", dataset_blend_test_set.mean(1))	
Example #11
0
        predicted_probs = clf.predict_proba(finalTestSparse)
        #predicted_list.append([x[1] for x in predicted_probs])
        dataset_blend_test_set[:, foldCount] = predicted_probs[:, 1]

        foldCount = foldCount + 1

        #break

    dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

    now = datetime.datetime.now()

    csv_io.write_delimited_file_single_plus_index(
        "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") +
        "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
        dataset_blend_test_set.mean(1))
    csv_io.write_delimited_file_single(
        "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
        str(avg) + "_" + str(clf)[:12] + ".csv",
        dataset_blend_train[:, ExecutionIndex])
    csv_io.write_delimited_file("../predictions/RunLog.csv", [
        now.strftime("%Y %m %d %H %M %S"),
        str(avg),
        str(clf),
        str(NumFolds), model, "", ""
    ],
                                filemode="a",
                                delimiter=",")

    print "------------------------Average: ", avg
Example #12
0
def run_stack(SEED):


	model = "base"

	trainBase = csv_io_np.read_data("PreProcessData/train.csv", skipFirstLine = True, split = ",")
	test = csv_io_np.read_data("PreProcessData/test.csv", skipFirstLine = True, split = ",")

	print "Data Read Complete"
	
	avg = 0
	NumFolds = 5 


	predicted_list = []
	bootstrapLists = []

	# 100 produced 94%
	# 1000 did not finish in about 5+ hours...
	# 300 about 5 hours, .9691 on first CF
	# learn_rate=0.01, n_estimators=300, subsample=1.0, min_samples_split=30, 0.9386
	#		GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=300, subsample=1.0, min_samples_split=30, min_samples_leaf=1, max_depth=5, init=None, random_state=None, max_features=None)
	
	# Leader board of 98443, for 20th place.
	#SVC(C=10**6, kernel='rbf', degree=3, gamma=10**-6.35, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)

	clfs = [
		SVC(C=10**6, kernel='rbf', degree=3, gamma=10**-6.35, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False)
		]		
	
	
	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	targetPre = [x[0] for x in trainBase]
	trainPre = [x[1:] for x in trainBase]
	testPre = [x[0:] for x in test]
	
	# image best results [-1, 1]
	#preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
	
	#print trainPre[0]
	#scaler = preprocessing.Scaler().fit(trainPre)
	#trainScaled = scaler.transform(trainPre)
	#testScaled = scaler.transform(testPre)	
	trainScaled = trainPre
	testScaled = testPre
	
	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	lenTrainBase = len(trainBase)
	trainBase = []
	
	lenTest = len(test)
	test = []
	
	trainPre = []
	testPre = []
	
	gc.collect()
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((lenTest, NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
			
		
		
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainScaled[i] for i in train_index]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainScaled[i] for i in test_index]	
	
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(train[0]), len(target), len(trainTest), len(trainTest[0])
			
			print datetime.datetime.now()
			clf.fit(train, target)
			print datetime.datetime.now()
			prob = clf.predict(trainTest) 
			
			dataset_blend_train[test_index, ExecutionIndex] = prob



	
			probSum = 0.0
			count = 0.0

			
			for i in range(0, len(prob)):
				probX = prob[i]#[1]
				#print probX, targetTest[i]
				if ( targetTest[i] == probX ) :
					probSum += 1.0
				count = count + 1.0
		
			print "Sum: ", probSum, count
			print "Score: ", probSum/count
 
			avg += 	(probSum/count)/NumFolds

			
			#predicted_probs = clf.predict(testScaled) 	
			######predicted_list.append([x[1] for x in predicted_probs])	
			#dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
		
				
			foldCount = foldCount + 1
		
		print "Final Train", datetime.datetime.now()
		clf.fit(trainScaled, targetPre)  # must do this for multiclass classification...
		print "Final Predict", datetime.datetime.now()
		predicted_probs = clf.predict(testScaled) 	
		print "Writing Data", datetime.datetime.now()
		#dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		

		
		now = datetime.datetime.now()

		#csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", predicted_probs) # for multiclass
		
		csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a",delimiter=",")
		
		
		print "------------------------Average: ", avg



	return dataset_blend_train, dataset_blend_test
Example #13
0
def run_stack(SEED):


	train, test = util.get_train_test_df()

	columns = set(train.columns)
	columns.remove("SalesID")
	columns.remove("SalePrice")
	columns.remove("saledate")

	
	train_fea = get_date_dataframe(train["saledate"])
	test_fea = get_date_dataframe(test["saledate"])

	for col in columns:
		types = set(type(x) for x in train[col])
		if str in types:
			s = set(x for x in train[col])
			str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
			train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
			test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
		else:
			train_fea = train_fea.join(train[col])
			test_fea = test_fea.join(test[col])
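	# The loop above maps each string column to integer codes fitted on the training set
	# only; strings that appear only in the test set fall back to -1 through the
	# defaultdict (see the standalone sketch after this example).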


	model = "" 
	print "Running Stack."

	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant.

	#targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
	#trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")	
	#test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

	#trainBase = trainBase[0:5000]
	#targetX = targetX[0:5000]
	
	#train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
	#test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")
	

	predicted_list = []
	bootstrapLists = []


	clfs = [
	
			GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1)
	]		
	#GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)	
	#GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),	
	#GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),	
	

	#train_fea, train["SalePrice"]
	print "Data size: ", len(train_fea) , len(test_fea)
	#dataset_blend_train = np.zeros((len(train_fea), len(clfs)))
	#dataset_blend_test = np.zeros((len(test), len(clfs)))
	dataset_blend_test = np.zeros((len(test_fea), len(clfs))) # np.zeros(len(train_fea), len(clfs))	
	dataset_blend_train = np.zeros((len(train_fea), len(clfs)))	

	
	print "Begin Training"

	lenTrainBase = 401125 # len(train_fea)



	lenTest = 11573 # len(test_fea)
	gc.collect()
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((lenTest, NumFolds))
		
		foldCount = 0
		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
			
		
		for train_index, test_index in Folds:

			targetX = [train["SalePrice"][i] for i in train_index]
			trainX = [train_fea.ix[i] for i in train_index]
			
			targetTest = [train["SalePrice"][i] for i in test_index]	
			trainTest = [train_fea.ix[i] for i in test_index]
			

			gc.collect()
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(trainX), len(targetX)
			
			#print trainX[0]
			#print target[0]
			#return
			
			print "Start", datetime.datetime.now()
			clf.fit(trainX, targetX)
			prob = clf.predict(trainTest) 
			print "End  ", datetime.datetime.now()
			
			dataset_blend_train[test_index, ExecutionIndex] = prob

			gc.collect()

	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]
				#print targetTest[i], probX
				
				if probX < 0: # some are coming out negative.
					probX = -probX			

				probSum += math.pow(math.log10(targetTest[i]) - math.log10(probX), 2)
				
				#probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				#weightSum += weights[test_index[i]][0] 
				
				
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", math.sqrt(probSum/len(prob))
 
			avg += 	math.sqrt(probSum/len(prob))/NumFolds

			gc.collect()
			
		
			predicted_probs = []
			
			for i in range(0,lenTest):
				predicted_probs.append(clf.predict(test_fea.ix[i])[0])  # take the scalar so the column assignment below broadcasts
				
			
			#predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
			gc.collect()
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a",delimiter=",")
		
		
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
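Finally, a hedged sketch of the train-fitted categorical encoding used at the top of the function above: string levels seen in training receive integer codes, and anything unseen in the test set maps to -1 through the defaultdict (the column name in the usage comment is illustrative).

from collections import defaultdict

def fit_categorical(train_values):
    levels = sorted(set(train_values))
    return defaultdict(lambda: -1, ((v, i) for i, v in enumerate(levels)))

# codes = fit_categorical(train["UsageBand"])
# encoded_test = [codes[v] for v in test["UsageBand"]]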