Esempio n. 1
0
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)
    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack_gb.run_stack()

    clf = LogisticRegression()
    clf.fit(dataset_blend_train, target)
    submission = clf.predict_proba(dataset_blend_test)[:, 1]

    submission = ["%f" % x for x in submission]
    now = datetime.datetime.now()
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack_" + now.strftime("%Y%m%d%H%M") + ".csv",
        "PreProcessData/test_PatientGuid.csv", submission)

    # attempt to score the training set to predict score for blend...
    probSum = 0.0
    trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
    for i in range(0, len(trainPrediction)):
        probX = trainPrediction[i]
        if (probX > 0.999):
            probX = 0.999
        if (probX < 0.001):
            probX = 0.001

        probSum += int(
            target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

    print "Train Score: ", (-probSum / len(trainPrediction))

    var = raw_input("Enter to terminate.")
Esempio n. 2
0
def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack_knn.run_stack()

	clf = LogisticRegression()
	clf.fit(dataset_blend_train, target)
	submission = clf.predict_proba(dataset_blend_test)[:,1]
	
 	submission = ["%f" % x for x in submission]
	now = datetime.datetime.now()
	csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M") + ".csv", "PreProcessData/test_PatientGuid.csv", submission)	
	

	
	# attempt to score the training set to predict score for blend...
	probSum = 0.0
	trainPrediction = clf.predict_proba(dataset_blend_train)[:,1]
	for i in range(0, len(trainPrediction)):
		probX = trainPrediction[i]
		if ( probX > 0.999):
			probX = 0.999;		
		if ( probX < 0.001):
			probX = 0.001;

		probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
		 
	print "Train Score: ", (-probSum/len(trainPrediction))
	
	
	
	var = raw_input("Enter to terminate.")	
Esempio n. 3
0
File: blend.py Progetto: mb16/Kaggle
def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	
	SEED = 448
	random.seed(SEED)
	random.shuffle(trainBase)
	
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

	clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]
	
	test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
	dataset_blend_test_j = np.zeros((len(test), len(clfs)))
	
	for ExecutionIndex, clf in enumerate(clfs):
		#clf = LogisticRegression()
		clf.fit(dataset_blend_train, target)
		submission = clf.predict_proba(dataset_blend_test)[:,1]
		
		submission = ["%f" % x for x in submission]
		now = datetime.datetime.now()
		csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", submission)	
		

		
		# attempt to score the training set to predict score for blend...
		probSum = 0.0
		trainPrediction = clf.predict_proba(dataset_blend_train)[:,1]
		for i in range(0, len(trainPrediction)):
			probX = trainPrediction[i]
			if ( probX > 0.999):
				probX = 0.999;		
			if ( probX < 0.001):
				probX = 0.001;

			probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
			 
		print "Train Score: ", (-probSum/len(trainPrediction))
	
		dataset_blend_test_j[:, ExecutionIndex] = submission
	
	
	
	csv_io.write_delimited_file_GUID_numpy("../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))	
	var = raw_input("Enter to terminate.")	
Esempio n. 4
0
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

    clfs = [
        LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=0.5,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=0.1,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l1',
                           dual=False,
                           tol=0.0001,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l1',
                           dual=False,
                           tol=0.0001,
                           C=0.5,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None),
        LogisticRegression(penalty='l1',
                           dual=False,
                           tol=0.0001,
                           C=0.1,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None)
    ]

    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
    dataset_blend_test_j = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):
        #clf = LogisticRegression()
        clf.fit(dataset_blend_train, target)
        submission = clf.predict_proba(dataset_blend_test)[:, 1]

        submission = ["%f" % x for x in submission]
        now = datetime.datetime.now()
        csv_io.write_delimited_file_GUID(
            "../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv",
            "PreProcessData/test_PatientGuid.csv", submission)

        # attempt to score the training set to predict score for blend...
        probSum = 0.0
        trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]
            if (probX > 0.999):
                probX = 0.999
            if (probX < 0.001):
                probX = 0.001

            probSum += int(
                target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

        print "Train Score: ", (-probSum / len(trainPrediction))

        dataset_blend_test_j[:, ExecutionIndex] = submission

    csv_io.write_delimited_file_GUID_numpy(
        "../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))
    var = raw_input("Enter to terminate.")
Esempio n. 5
0
def run_stack():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
	
	avg = 0
	NumFolds = 5 # should be odd for median

	predicted_list = []
	
	spanDistance = 12
	bootstrapLists = []
	
	clfs = [RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
            GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]
		

	
	print len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		
		predicted_list = []
		avg = 0
		
		dataset_blend_test_j = np.zeros((len(test), NumFolds))
		
		foldCount = 0
		
		#print [trainBase[i][0] for i in range(len(trainBase))]
		#Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)		
		Folds = cross_validation.StratifiedKFold([trainBase[i][0] for i in range(len(trainBase))], k=NumFolds, indices=True)
		for train_index, test_index in Folds:

			trainBaseTemp = [trainBase[i] for i in train_index]
			target = [x[0] for x in trainBaseTemp]
			train = [x[1:] for x in trainBaseTemp]
	
			testBaseTemp = [trainBase[i] for i in test_index]
			targetTest = [x[0] for x in testBaseTemp]
			trainTest = [x[1:] for x in testBaseTemp]
	
	

			test = [x[0:] for x in test]
	

			#rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None

			clf.fit(train, target)
			prob = clf.predict_proba(trainTest) 
			
			dataset_blend_train[test_index, ExecutionIndex] = prob[:,1] 
			
	
			probSum = 0
			totalOffByHalf = 0
			totalPositive = 0
			totalPositiveOffByHalf = 0
			totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i][1] # [1]
				if ( probX > 0.999):
					probX = 0.999;		
				if ( probX < 0.001):
					probX = 0.001;
				#print i, probSum, probX, targetTest[i]
				#print target[i]*log(probX), (1-target[i])*log(1-probX)
				probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					totalOffByHalf = totalOffByHalf + 1		
			
				if ( int(targetTest[i]) == 1 ):
					totalPositive = totalPositive + 1
				if ( int(targetTest[i]) == 1 and probX < 0.5):
					totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				if (probX > 0.5):
					totalPositivePredictions = totalPositivePredictions + 1			
			
			print "Total Off By > 0.5 ", totalOffByHalf
			print "Total Positive ", totalPositive
			print "Total Positive Off By Half ", totalPositiveOffByHalf
			print "Total Positive Predictions ", totalPositivePredictions
			print -probSum/len(prob)
	
 
			avg += 	(-probSum/len(prob))/NumFolds

			predicted_probs = clf.predict_proba(test)  # was test						
			#print [x[1] for x in predicted_probs]
			predicted_list.append([x[1] for x in predicted_probs])
		
			dataset_blend_test_j[:, foldCount] = predicted_probs[:,1]
		
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_j.mean(1)
		
		
		print "------------------------Average: ", avg

		avg_list = np.zeros(len(test))
		med_list = np.zeros(len(test))
	
		# For N folds, get the average/median for each prediction item in test set.
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(predicted_list)):		
				temp_list.append(  predicted_list[q][p]) 
			
			avg_list[p] = mean(temp_list) 
			med_list[p] = getMedian(temp_list) 
		
			#print p, q, temp_list, mean(temp_list), getMedian(temp_list)
			

		bootstrapLists.append(avg_list)

		
	# This would be used if we ran multiple runs with different training values.
	# Primitive stacking, should rather save data, and do formal stacking.
	if ( len(bootstrapLists) > 1 ):
		finalList = []
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(bootstrapLists)):		
				temp_list.append(  bootstrapLists[q][p]) 
			
			finalList.append( meanSpan(temp_list, spanDistance) )
		
			#print p, q, temp_list, meanSpan(temp_list, spanDistance)
	else:
		finalList = bootstrapLists[0]		
		
	#finalList = SimpleScale(finalList)
	avg_values = ["%f" % x for x in finalList]
	csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv", "PreProcessData/test_PatientGuid.csv", avg_values)	
	
	#for rec in dataset_blend_train:
	#	print rec
	
	return dataset_blend_train, dataset_blend_test
Esempio n. 6
0
def run_stack():
    print "Running GB Stack"

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)

    avg = 0
    NumFolds = 5  # should be odd for median

    NumFeatures = 1000

    predicted_list = []

    spanDistance = 12
    bootstrapLists = []

    #clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #		GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    #		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True),
    #        RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True)]

    rnd_start = 456

    #n_estArr = [40, 80, 120] #20,40,80,160,,640,1280,4000,8000,16000
    #learn_rArr =  [0.5, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001]
    n_estArr = [0.8, 0.4, 0.2, 0.1, 0.05,
                0.025]  #20,40,80,160,,640,1280,4000,8000,16000
    learn_rArr = [4, 8, 12, 18]

    print len(trainBase), len(test)
    dataset_blend_train = np.zeros(
        (len(trainBase), len(n_estArr) * len(learn_rArr)))
    dataset_blend_test = np.zeros((len(test), len(n_estArr) * len(learn_rArr)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Start Feaure Select"
    #f_classif(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
    #print "done1"
    #fs = SelectKBest(chi2, k=NumFeatures)
    #fs.fit(scipy.array([x[1:] for x in trainBase]), scipy.array([x[0] for x in trainBase]))
    #fs.fit(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
    print "End Feaure Select"

    LastClassifier = ""
    ExecutionIndex = 0

    #for ExecutionIndex, clf in enumerate(clfs):
    for n_est in n_estArr:
        for learn_r in learn_rArr:
            print "n_est ", n_est, "learn_r ", learn_r
            #clf = GradientBoostingClassifier(loss='deviance', learn_rate=learn_r, n_estimators=n_est, subsample=0.2, min_samples_split=1, min_samples_leaf=1, max_depth=8, init=None, random_state=rnd_start)
            clf = GradientBoostingClassifier(loss='deviance',
                                             learn_rate=0.05,
                                             n_estimators=50,
                                             subsample=n_est,
                                             min_samples_split=1,
                                             min_samples_leaf=1,
                                             max_depth=learn_r,
                                             init=None,
                                             random_state=rnd_start)

            print clf
            avg = 0

            predicted_list = []

            dataset_blend_test_j = np.zeros((len(test), NumFolds))

            foldCount = 0

            #print [trainBase[i][0] for i in range(len(trainBase))]
            #Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)
            Folds = cross_validation.StratifiedKFold(
                [trainBase[i][0] for i in range(len(trainBase))],
                k=NumFolds,
                indices=True)
            for train_index, test_index in Folds:

                trainBaseTemp = [trainBase[i] for i in train_index]
                target = [x[0] for x in trainBaseTemp]
                train = [x[1:] for x in trainBaseTemp]

                testBaseTemp = [trainBase[i] for i in test_index]
                targetTest = [x[0] for x in testBaseTemp]
                trainTest = [x[1:] for x in testBaseTemp]

                test = [x[0:] for x in test]

                #rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None

                print "LEN: ", len(train), len(target)

                if (False and LastClassifier != str(clf)[:10]
                        and (str(clf).startswith('RandomForest')
                             or str(clf).startswith('ExtraTrees'))):

                    clf.fit(train, target)

                    LastClassifier = str(clf)[:10]
                    print "Computing Importances"
                    importances = clf.feature_importances_
                    #print importances
                    importancesTemp = sorted(importances, reverse=True)
                    print len(importancesTemp), "importances"

                    if (len(importancesTemp) > NumFeatures):
                        threshold = importancesTemp[NumFeatures]
                        #print "Sorted and deleted importances"
                        #print importancesTemp

                        for row in train:
                            newRow = []
                            for impIndex, importance in enumerate(importances):
                                if (importance > threshold):
                                    newRow.append(row[impIndex])
                            trainNew.append(newRow)

                        for row in trainTest:
                            newRow = []
                            for impIndex, importance in enumerate(importances):
                                if (importance > threshold):
                                    newRow.append(row[impIndex])
                            trainTestNew.append(newRow)

                        for row in test:
                            newRow = []
                            for impIndex, importance in enumerate(importances):
                                if (importance > threshold):
                                    #print impIndex, len(importances)
                                    newRow.append(row[impIndex])
                            testNew.append(newRow)

                    else:
                        trainNew = train
                        trainTestNew = trainTest
                        testNew = test
                else:
                    #trainNew = fs.transform(train)
                    #trainTestNew = fs.transform(trainTest)
                    #testNew = fs.transform(test)
                    trainNew = train
                    trainTestNew = trainTest
                    testNew = test

                clf.fit(trainNew, target)

                prob = clf.predict_proba(trainTestNew)

                dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

                probSum = 0
                totalOffByHalf = 0
                totalPositive = 0
                totalPositiveOffByHalf = 0
                totalPositivePredictions = 0

                for i in range(0, len(prob)):
                    probX = prob[i][1]  # [1]
                    if (probX > 0.999):
                        probX = 0.999
                    if (probX < 0.001):
                        probX = 0.001
                    #print i, probSum, probX, targetTest[i]
                    #print target[i]*log(probX), (1-target[i])*log(1-probX)
                    probSum += int(targetTest[i]) * log(probX) + (
                        1 - int(targetTest[i])) * log(1 - probX)
                    if (math.fabs(probX - int(targetTest[i])) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1

                    if (int(targetTest[i]) == 1):
                        totalPositive = totalPositive + 1
                    if (int(targetTest[i]) == 1 and probX < 0.5):
                        totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                    if (probX > 0.5):
                        totalPositivePredictions = totalPositivePredictions + 1

                print "Total Off By > 0.5 ", totalOffByHalf
                print "Total Positive ", totalPositive
                print "Total Positive Off By Half ", totalPositiveOffByHalf
                print "Total Positive Predictions ", totalPositivePredictions
                print -probSum / len(prob)

                avg += (-probSum / len(prob)) / NumFolds

                predicted_probs = clf.predict_proba(testNew)  # was test
                #print [x[1] for x in predicted_probs]
                predicted_list.append([x[1] for x in predicted_probs])

                dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1]

                foldCount = foldCount + 1

            dataset_blend_test[:,
                               ExecutionIndex] = dataset_blend_test_j.mean(1)
            now = datetime.datetime.now()
            #csv_io.write_delimited_file_GUID("../Submissions/stack_avg" + now.strftime("%Y%m%d%H%M") + "_" + str(avg) + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))

            print "------------------------------------------------Average: ", avg
            open("stack_gb_data.txt", "a").write(
                str(n_est) + ',' + str(learn_r) + ',' + str(avg) + "\n")

            avg_list = np.zeros(len(test))
            med_list = np.zeros(len(test))

            # For N folds, get the average/median for each prediction item in test set.
            for p in range(0, len(test)):
                temp_list = []
                for q in range(0, len(predicted_list)):
                    temp_list.append(predicted_list[q][p])

                avg_list[p] = mean(temp_list)
                med_list[p] = getMedian(temp_list)

                #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

            bootstrapLists.append(avg_list)

            ExecutionIndex = ExecutionIndex + 1

    # This would be used if we ran multiple runs with different training values.
    # Primitive stacking, should rather save data, and do formal stacking.
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            #print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    #finalList = SimpleScale(finalList)
    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file_GUID("../Submissions/gb_5fold_avg.csv",
                                     "PreProcessData/test_PatientGuid.csv",
                                     avg_values)

    #for rec in dataset_blend_train:
    #	print rec

    return dataset_blend_train, dataset_blend_test
Esempio n. 7
0
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack_rf.run_stack(SEED)

    clf = LogisticRegression()
    clf.fit(dataset_blend_train, target)
    submission = clf.predict_proba(dataset_blend_test)[:, 1]

    submission = ["%f" % x for x in submission]
    now = datetime.datetime.now()
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack" + now.strftime("%Y%m%d%H%M") + ".csv",
        "PreProcessData/test_PatientGuid.csv", submission)

    # attempt to score the training set to predict score for blend...
    probSum = 0.0
    trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
    for i in range(0, len(trainPrediction)):
        probX = trainPrediction[i]
        if (probX > 0.999):
            probX = 0.999
        if (probX < 0.001):
            probX = 0.001

        probSum += int(
            target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

    print "Train Score: ", (-probSum / len(trainPrediction))

    trainPredictionNew = stack_rf.SimpleScale(trainPrediction,
                                              floor=0.001,
                                              ceiling=0.999)
    probSum = 0.0
    for i in range(0, len(trainPredictionNew)):
        probX = trainPredictionNew[i]
        if (probX > 0.999):
            probX = 0.999
        if (probX < 0.001):
            probX = 0.001

        probSum += int(
            target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

    print "Train Score for 0.999 and 0.001 with SimpleScale: ", (
        -probSum / len(trainPredictionNew))

    trainPredictionNew = stack_rf.SimpleScale(trainPrediction,
                                              floor=0.01,
                                              ceiling=0.99)
    probSum = 0.0
    for i in range(0, len(trainPredictionNew)):
        probX = trainPredictionNew[i]
        if (probX > 0.999):
            probX = 0.999
        if (probX < 0.001):
            probX = 0.001

        probSum += int(
            target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

    print "Train Score for 0.99 and 0.01 with SimpleScale: ", (
        -probSum / len(trainPredictionNew))

    trainPredictionNew = stack_rf.SimpleScale(trainPrediction,
                                              floor=0.05,
                                              ceiling=0.95)
    probSum = 0.0
    for i in range(0, len(trainPredictionNew)):
        probX = trainPredictionNew[i]
        if (probX > 0.999):
            probX = 0.999
        if (probX < 0.001):
            probX = 0.001

        probSum += int(
            target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

    print "Train Score for 0.95 and 0.05 with SimpleScale: ", (
        -probSum / len(trainPredictionNew))

    submissionNew = stack_rf.SimpleScale(submission,
                                         floor=0.001,
                                         ceiling=0.999)
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack" + now.strftime("%Y%m%d%H%M") +
        "_SimpleScale999.csv", "PreProcessData/test_PatientGuid.csv",
        submissionNew)

    submissionNew = stack_rf.SimpleScale(submission, floor=0.01, ceiling=0.99)
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack" + now.strftime("%Y%m%d%H%M") +
        "_SimpleScale99.csv", "PreProcessData/test_PatientGuid.csv",
        submissionNew)

    submissionNew = stack_rf.SimpleScale(submission, floor=0.05, ceiling=0.95)
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack" + now.strftime("%Y%m%d%H%M") +
        "_SimpleScale95.csv", "PreProcessData/test_PatientGuid.csv",
        submissionNew)

    var = raw_input("Enter to terminate.")
Esempio n. 8
0
File: stack.py Progetto: mb16/Kaggle
def run_stack(SEED):

	print "Running GB, RF, ET stack x2"

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
	
	random.seed(SEED)
	random.shuffle(trainBase)
	
	avg = 0
	NumFolds = 10 # should be odd for median

	NumFeatures = 1000

	predicted_list = []
	
	spanDistance = 12
	bootstrapLists = []
	

	#clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#		GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)
	#		]
	# try to vary n_est
	#clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#		GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	#		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
	#		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy')]

	clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy')]		
			
			
	# note, can use 50, 100, 150, 200 for n_estimators for ET and RF
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(lesarn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

			
	#clfs = [GradientBoostingClassifier(learn_rate=0.2, subsample=0.2, max_depth=8, n_estimators=80),
	#		GradientBoostingClassifier(learn_rate=0.1, subsample=0.2, max_depth=8, n_estimators=160),
	#		GradientBoostingClassifier(learn_rate=0.1, subsample=0.2, max_depth=8, n_estimators=320),
	#		GradientBoostingClassifier(learn_rate=0.05, subsample=0.2, max_depth=8, n_estimators=640),
	#		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True),
	#		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True)]
	
	
	print len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Start Feaure Select"
	#f_classif(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
	#print "done1"
	#fs = SelectKBest(chi2, k=NumFeatures)
	#fs.fit(scipy.array([x[1:] for x in trainBase]), scipy.array([x[0] for x in trainBase]))
	#fs.fit(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
	print "End Feaure Select"	
	
	LastClassifier = ""

	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_j = np.zeros((len(test), NumFolds))
		
		foldCount = 0
		
		#print [trainBase[i][0] for i in range(len(trainBase))]
		#Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)	

		#StratifiedShuffleSplit has much poorer performance than StratifiedKFold
		#NOTE, the shuffle and bootstrap don't promise all elements are used in training, and then the blend has missing values which means it won't predicte correctly.
		#Folds = StratifiedShuffleSplit([trainBase[i][0] for i in range(len(trainBase))], NumFolds, indices=True)
		#Folds = cross_validation.Bootstrap(len(trainBase), n_bootstraps=5, train_size=0.8, random_state=0)
		Folds = cross_validation.StratifiedKFold([trainBase[i][0] for i in range(len(trainBase))], k=NumFolds, indices=True)
		for train_index, test_index in Folds:

			trainBaseTemp = [trainBase[i] for i in train_index]
			target = [x[0] for x in trainBaseTemp]
			train = [x[1:] for x in trainBaseTemp]
	
			testBaseTemp = [trainBase[i] for i in test_index]
			targetTest = [x[0] for x in testBaseTemp]
			trainTest = [x[1:] for x in testBaseTemp]
	
	

			test = [x[0:] for x in test]
	

			#rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None

			print "LEN: ", len(train), len(target)
			

			
			if (False and LastClassifier !=  str(clf)[:10] and (str(clf).startswith( 'RandomForest' ) or str(clf).startswith( 'ExtraTrees' ))) :

				clf.fit(train, target)
			
				LastClassifier = str(clf)[:10]
				print "Computing Importances"
				importances = clf.feature_importances_
				#print importances
				importancesTemp = sorted(importances, reverse=True)
				print len(importancesTemp), "importances"
				
				if ( len(importancesTemp) > NumFeatures):
					threshold = importancesTemp[NumFeatures]
					#print "Sorted and deleted importances"
					#print importancesTemp

					for row in train:
						newRow = []
						for impIndex, importance in enumerate(importances):
							if ( importance > threshold ) :	
								newRow.append(row[impIndex])
						trainNew.append(newRow)	

					for row in trainTest:
						newRow = []
						for impIndex, importance in enumerate(importances):
							if ( importance > threshold ) :
								newRow.append(row[impIndex])
						trainTestNew.append(newRow)	

					for row in test:
						newRow = []
						for impIndex, importance in enumerate(importances):
							if ( importance > threshold ) :
								#print impIndex, len(importances)
								newRow.append(row[impIndex])
						testNew.append(newRow)	
				
				else:
					trainNew = train
					trainTestNew = trainTest
					testNew = test	
			else:
				#trainNew = fs.transform(train)
				#trainTestNew = fs.transform(trainTest)
				#testNew = fs.transform(test)
				trainNew = train
				trainTestNew = trainTest
				testNew = test

			clf.fit(trainNew, target)



			prob = clf.predict_proba(trainTestNew) 
			
			dataset_blend_train[test_index, ExecutionIndex] = prob[:,1] 
			
	
			probSum = 0
			totalOffByHalf = 0
			totalPositive = 0
			totalPositiveOffByHalf = 0
			totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i][1] # [1]
				if ( probX > 0.999):
					probX = 0.999;		
				if ( probX < 0.001):
					probX = 0.001;
				#print i, probSum, probX, targetTest[i]
				#print target[i]*log(probX), (1-target[i])*log(1-probX)
				probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					totalOffByHalf = totalOffByHalf + 1		
			
				if ( int(targetTest[i]) == 1 ):
					totalPositive = totalPositive + 1
				if ( int(targetTest[i]) == 1 and probX < 0.5):
					totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				if (probX > 0.5):
					totalPositivePredictions = totalPositivePredictions + 1			
			
			print "Total Off By > 0.5 ", totalOffByHalf
			print "Total Positive ", totalPositive
			print "Total Positive Off By Half ", totalPositiveOffByHalf
			print "Total Positive Predictions ", totalPositivePredictions
			print -probSum/len(prob)
	
 
			avg += 	(-probSum/len(prob))/NumFolds

			predicted_probs = clf.predict_proba(testNew)  # was test						
			#print [x[1] for x in predicted_probs]
			predicted_list.append([x[1] for x in predicted_probs])
		
			dataset_blend_test_j[:, foldCount] = predicted_probs[:,1]
		
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_j.mean(1)  # try median here (seems not implemented)
		
		now = datetime.datetime.now()
		#csv_io.write_delimited_file_GUID("../Submissions/stack_avg" + now.strftime("%Y%m%d%H%M") + "_" + str(avg) + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))
		
		print "------------------------Average: ", avg

		avg_list = np.zeros(len(test))
		med_list = np.zeros(len(test))
	
		# For N folds, get the average/median for each prediction item in test set.
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(predicted_list)):		
				temp_list.append(  predicted_list[q][p]) 
			
			avg_list[p] = mean(temp_list) 
			med_list[p] = getMedian(temp_list) 
		
			#print p, q, temp_list, mean(temp_list), getMedian(temp_list)
			

		bootstrapLists.append(avg_list)

		
	# This would be used if we ran multiple runs with different training values.
	# Primitive stacking, should rather save data, and do formal stacking.
	if ( len(bootstrapLists) > 1 ):
		finalList = []
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(bootstrapLists)):		
				temp_list.append(  bootstrapLists[q][p]) 
			
			finalList.append( meanSpan(temp_list, spanDistance) )
		
			#print p, q, temp_list, meanSpan(temp_list, spanDistance)
	else:
		finalList = bootstrapLists[0]		
		
	#finalList = SimpleScale(finalList)
	avg_values = ["%f" % x for x in finalList]
	csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv", "PreProcessData/test_PatientGuid.csv", avg_values)	
	
	#for rec in dataset_blend_train:
	#	print rec
	
	return dataset_blend_train, dataset_blend_test
Esempio n. 9
0
def run_stack(SEED):

    print "Running GB, RF, ET stack x2"

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)

    random.seed(SEED)
    random.shuffle(trainBase)

    avg = 0
    NumFolds = 10  # should be odd for median

    NumFeatures = 1000

    predicted_list = []

    spanDistance = 12
    bootstrapLists = []

    #clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #		GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)
    #		]
    # try to vary n_est
    #clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #		GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    #		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
    #		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy')]

    clfs = [
        GradientBoostingClassifier(learn_rate=0.05,
                                   subsample=0.5,
                                   max_depth=6,
                                   n_estimators=50),
        GradientBoostingClassifier(learn_rate=0.02,
                                   subsample=0.2,
                                   max_depth=8,
                                   n_estimators=125),
        RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy')
    ]

    # note, can use 50, 100, 150, 200 for n_estimators for ET and RF

    # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
    # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
    # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
    # GradientBoostingClassifier(lesarn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
    # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
    # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
    # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
    # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
    # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]

    #clfs = [GradientBoostingClassifier(learn_rate=0.2, subsample=0.2, max_depth=8, n_estimators=80),
    #		GradientBoostingClassifier(learn_rate=0.1, subsample=0.2, max_depth=8, n_estimators=160),
    #		GradientBoostingClassifier(learn_rate=0.1, subsample=0.2, max_depth=8, n_estimators=320),
    #		GradientBoostingClassifier(learn_rate=0.05, subsample=0.2, max_depth=8, n_estimators=640),
    #		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True),
    #		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True)]

    print len(trainBase), len(test)
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Start Feaure Select"
    #f_classif(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
    #print "done1"
    #fs = SelectKBest(chi2, k=NumFeatures)
    #fs.fit(scipy.array([x[1:] for x in trainBase]), scipy.array([x[0] for x in trainBase]))
    #fs.fit(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
    print "End Feaure Select"

    LastClassifier = ""

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_j = np.zeros((len(test), NumFolds))

        foldCount = 0

        #print [trainBase[i][0] for i in range(len(trainBase))]
        #Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)

        #StratifiedShuffleSplit has much poorer performance than StratifiedKFold
        #NOTE, the shuffle and bootstrap don't promise all elements are used in training, and then the blend has missing values which means it won't predicte correctly.
        #Folds = StratifiedShuffleSplit([trainBase[i][0] for i in range(len(trainBase))], NumFolds, indices=True)
        #Folds = cross_validation.Bootstrap(len(trainBase), n_bootstraps=5, train_size=0.8, random_state=0)
        Folds = cross_validation.StratifiedKFold(
            [trainBase[i][0] for i in range(len(trainBase))],
            k=NumFolds,
            indices=True)
        for train_index, test_index in Folds:

            trainBaseTemp = [trainBase[i] for i in train_index]
            target = [x[0] for x in trainBaseTemp]
            train = [x[1:] for x in trainBaseTemp]

            testBaseTemp = [trainBase[i] for i in test_index]
            targetTest = [x[0] for x in testBaseTemp]
            trainTest = [x[1:] for x in testBaseTemp]

            test = [x[0:] for x in test]

            #rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None

            print "LEN: ", len(train), len(target)

            if (False and LastClassifier != str(clf)[:10]
                    and (str(clf).startswith('RandomForest')
                         or str(clf).startswith('ExtraTrees'))):

                clf.fit(train, target)

                LastClassifier = str(clf)[:10]
                print "Computing Importances"
                importances = clf.feature_importances_
                #print importances
                importancesTemp = sorted(importances, reverse=True)
                print len(importancesTemp), "importances"

                if (len(importancesTemp) > NumFeatures):
                    threshold = importancesTemp[NumFeatures]
                    #print "Sorted and deleted importances"
                    #print importancesTemp

                    for row in train:
                        newRow = []
                        for impIndex, importance in enumerate(importances):
                            if (importance > threshold):
                                newRow.append(row[impIndex])
                        trainNew.append(newRow)

                    for row in trainTest:
                        newRow = []
                        for impIndex, importance in enumerate(importances):
                            if (importance > threshold):
                                newRow.append(row[impIndex])
                        trainTestNew.append(newRow)

                    for row in test:
                        newRow = []
                        for impIndex, importance in enumerate(importances):
                            if (importance > threshold):
                                #print impIndex, len(importances)
                                newRow.append(row[impIndex])
                        testNew.append(newRow)

                else:
                    trainNew = train
                    trainTestNew = trainTest
                    testNew = test
            else:
                #trainNew = fs.transform(train)
                #trainTestNew = fs.transform(trainTest)
                #testNew = fs.transform(test)
                trainNew = train
                trainTestNew = trainTest
                testNew = test

            clf.fit(trainNew, target)

            prob = clf.predict_proba(trainTestNew)

            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i][1]  # [1]
                if (probX > 0.999):
                    probX = 0.999
                if (probX < 0.001):
                    probX = 0.001
                #print i, probSum, probX, targetTest[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += int(targetTest[i]) * log(probX) + (
                    1 - int(targetTest[i])) * log(1 - probX)
                if (math.fabs(probX - int(targetTest[i])) > 0.5):
                    totalOffByHalf = totalOffByHalf + 1

                if (int(targetTest[i]) == 1):
                    totalPositive = totalPositive + 1
                if (int(targetTest[i]) == 1 and probX < 0.5):
                    totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                if (probX > 0.5):
                    totalPositivePredictions = totalPositivePredictions + 1

            print "Total Off By > 0.5 ", totalOffByHalf
            print "Total Positive ", totalPositive
            print "Total Positive Off By Half ", totalPositiveOffByHalf
            print "Total Positive Predictions ", totalPositivePredictions
            print -probSum / len(prob)

            avg += (-probSum / len(prob)) / NumFolds

            predicted_probs = clf.predict_proba(testNew)  # was test
            #print [x[1] for x in predicted_probs]
            predicted_list.append([x[1] for x in predicted_probs])

            dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1]

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean(
            1)  # try median here (seems not implemented)

        now = datetime.datetime.now()
        #csv_io.write_delimited_file_GUID("../Submissions/stack_avg" + now.strftime("%Y%m%d%H%M") + "_" + str(avg) + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))

        print "------------------------Average: ", avg

        avg_list = np.zeros(len(test))
        med_list = np.zeros(len(test))

        # For N folds, get the average/median for each prediction item in test set.
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(predicted_list)):
                temp_list.append(predicted_list[q][p])

            avg_list[p] = mean(temp_list)
            med_list[p] = getMedian(temp_list)

            #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

        bootstrapLists.append(avg_list)

    # This would be used if we ran multiple runs with different training values.
    # Primitive stacking, should rather save data, and do formal stacking.
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            #print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    #finalList = SimpleScale(finalList)
    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv",
                                     "PreProcessData/test_PatientGuid.csv",
                                     avg_values)

    #for rec in dataset_blend_train:
    #	print rec

    return dataset_blend_train, dataset_blend_test
Esempio n. 10
0
def run_stack():
	print "Running KNN Stack"
	
	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)
	test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
	
	avg = 0
	NumFolds = 5 # should be odd for median

	NumFeatures = 1000

	predicted_list = []
	
	spanDistance = 12
	bootstrapLists = []
	
	#clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#		GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	#		RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True),
    #        RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True),
    #        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True)]
		
	rnd_start = 456
	
	CC = [10,30,50,100]
	gg = [0]	
	
	print len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(CC)*len(gg)))
	dataset_blend_test = np.zeros((len(test), len(CC)*len(gg)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Start Feaure Select"
	#f_classif(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
	#print "done1"
	#fs = SelectKBest(chi2, k=NumFeatures)
	#fs.fit(scipy.array([x[1:] for x in trainBase]), scipy.array([x[0] for x in trainBase]))
	#fs.fit(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase]))
	print "End Feaure Select"	
	
	LastClassifier = ""
	ExecutionIndex = 0
	
	#for ExecutionIndex, clf in enumerate(clfs):
	for g in gg:	
		for C in CC:
			print "g ", g, " C " ,C
			clf = KNeighborsClassifier(n_neighbors=C, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)

			print clf
			avg = 0
		
			predicted_list = []
				
			dataset_blend_test_j = np.zeros((len(test), NumFolds))
			
			foldCount = 0
			
			#print [trainBase[i][0] for i in range(len(trainBase))]
			#Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)		
			Folds = cross_validation.StratifiedKFold([trainBase[i][0] for i in range(len(trainBase))], k=NumFolds, indices=True)
			for train_index, test_index in Folds:

				trainBaseTemp = [trainBase[i] for i in train_index]
				target = [x[0] for x in trainBaseTemp]
				train = [x[1:] for x in trainBaseTemp]
		
				testBaseTemp = [trainBase[i] for i in test_index]
				targetTest = [x[0] for x in testBaseTemp]
				trainTest = [x[1:] for x in testBaseTemp]
		
		

				test = [x[0:] for x in test]
		

				#rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None

				print "LEN: ", len(train), len(target)
				

				
				if (False and LastClassifier !=  str(clf)[:10] and (str(clf).startswith( 'RandomForest' ) or str(clf).startswith( 'ExtraTrees' ))) :

					clf.fit(train, target)
				
					LastClassifier = str(clf)[:10]
					print "Computing Importances"
					importances = clf.feature_importances_
					#print importances
					importancesTemp = sorted(importances, reverse=True)
					print len(importancesTemp), "importances"
					
					if ( len(importancesTemp) > NumFeatures):
						threshold = importancesTemp[NumFeatures]
						#print "Sorted and deleted importances"
						#print importancesTemp

						for row in train:
							newRow = []
							for impIndex, importance in enumerate(importances):
								if ( importance > threshold ) :	
									newRow.append(row[impIndex])
							trainNew.append(newRow)	

						for row in trainTest:
							newRow = []
							for impIndex, importance in enumerate(importances):
								if ( importance > threshold ) :
									newRow.append(row[impIndex])
							trainTestNew.append(newRow)	

						for row in test:
							newRow = []
							for impIndex, importance in enumerate(importances):
								if ( importance > threshold ) :
									#print impIndex, len(importances)
									newRow.append(row[impIndex])
							testNew.append(newRow)	
					
					else:
						trainNew = train
						trainTestNew = trainTest
						testNew = test	
				else:
					#trainNew = fs.transform(train)
					#trainTestNew = fs.transform(trainTest)
					#testNew = fs.transform(test)
					trainNew = train
					trainTestNew = trainTest
					testNew = test

				clf.fit(trainNew, target)



				prob = clf.predict_proba(trainTestNew) 
				
				dataset_blend_train[test_index, ExecutionIndex] = prob[:,1] 
				
		
				probSum = 0
				totalOffByHalf = 0
				totalPositive = 0
				totalPositiveOffByHalf = 0
				totalPositivePredictions = 0
				
				for i in range(0, len(prob)):
					probX = prob[i][1] # [1]
					if ( probX > 0.999):
						probX = 0.999;		
					if ( probX < 0.001):
						probX = 0.001;
					#print i, probSum, probX, targetTest[i]
					#print target[i]*log(probX), (1-target[i])*log(1-probX)
					probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
					if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
						totalOffByHalf = totalOffByHalf + 1		
				
					if ( int(targetTest[i]) == 1 ):
						totalPositive = totalPositive + 1
					if ( int(targetTest[i]) == 1 and probX < 0.5):
						totalPositiveOffByHalf = totalPositiveOffByHalf + 1
					if (probX > 0.5):
						totalPositivePredictions = totalPositivePredictions + 1			
				
				print "Total Off By > 0.5 ", totalOffByHalf
				print "Total Positive ", totalPositive
				print "Total Positive Off By Half ", totalPositiveOffByHalf
				print "Total Positive Predictions ", totalPositivePredictions
				print -probSum/len(prob)
		
	 
				avg += 	(-probSum/len(prob))/NumFolds

				predicted_probs = clf.predict_proba(testNew)  # was test						
				#print [x[1] for x in predicted_probs]
				predicted_list.append([x[1] for x in predicted_probs])
			
				dataset_blend_test_j[:, foldCount] = predicted_probs[:,1]
			
				foldCount = foldCount + 1
			
				#break ## ****************************************************************** cut off cross folds to 1.
			
			dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_j.mean(1)
			
			
			print "------------------------------------------------Average: ", avg
			open("stack_svm_poly_data.txt","a").write(str(g)+','+str(C)+','+str(avg)+"\n")
	
			avg_list = np.zeros(len(test))
			med_list = np.zeros(len(test))
		
			# For N folds, get the average/median for each prediction item in test set.
			for p in range(0, len(test)):
				temp_list =[]	
				for q in range(0, len(predicted_list)):		
					temp_list.append(  predicted_list[q][p]) 
				
				avg_list[p] = mean(temp_list) 
				med_list[p] = getMedian(temp_list) 
			
				#print p, q, temp_list, mean(temp_list), getMedian(temp_list)
				

			bootstrapLists.append(avg_list)

			ExecutionIndex = ExecutionIndex + 1
		
	# This would be used if we ran multiple runs with different training values.
	# Primitive stacking, should rather save data, and do formal stacking.
	if ( len(bootstrapLists) > 1 ):
		finalList = []
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(bootstrapLists)):		
				temp_list.append(  bootstrapLists[q][p]) 
			
			finalList.append( meanSpan(temp_list, spanDistance) )
		
			#print p, q, temp_list, meanSpan(temp_list, spanDistance)
	else:
		finalList = bootstrapLists[0]		
		
	#finalList = SimpleScale(finalList)
	avg_values = ["%f" % x for x in finalList]
	csv_io.write_delimited_file_GUID("../Submissions/gb_5fold_avg.csv", "PreProcessData/test_PatientGuid.csv", avg_values)	
	
	#for rec in dataset_blend_train:
	#	print rec
	
	return dataset_blend_train, dataset_blend_test
Esempio n. 11
0
def run_rf():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
    test = [x[0:] for x in test]

    avg = 0
    NumFolds = 5  # should be odd for median

    predicted_list = []

    spanDistance = 12
    bootstrapLists = []

    NEstimators = [150, 250]  # [50,100,150,200,300,400,500,600]

    ExecutionIndex = 0

    print len(trainBase), len(test)
    dataset_blend_train = np.zeros((len(trainBase), len(NEstimators)))
    dataset_blend_test = np.zeros((len(test), len(NEstimators)))

    for n_est in NEstimators:

        predicted_list = []
        avg = 0

        dataset_blend_test_j = np.zeros((len(test), NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(len(trainBase) - 1,
                                       k=NumFolds,
                                       indices=True,
                                       shuffle=False,
                                       random_state=None)
        for train_index, test_index in Folds:

            trainBaseTemp = [trainBase[i] for i in train_index]
            target = [x[0] for x in trainBaseTemp]
            train = [x[1:] for x in trainBaseTemp]

            testBaseTemp = [trainBase[i] for i in test_index]
            targetTest = [x[0] for x in testBaseTemp]
            trainTest = [x[1:] for x in testBaseTemp]

            rf = RandomForestClassifier(n_estimators=n_est,
                                        criterion='entropy',
                                        max_depth=None,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        min_density=0.10000000000000001,
                                        max_features='auto',
                                        bootstrap=True,
                                        compute_importances=False,
                                        oob_score=False,
                                        n_jobs=1,
                                        random_state=None,
                                        verbose=0)  # , max_features=None

            rf.fit(train, target)
            prob = rf.predict_proba(trainTest)
            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i][1]  # [1]
                if (probX > 0.999):
                    probX = 0.999
                if (probX < 0.001):
                    probX = 0.001
                #print i, probSum, probX, targetTest[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += int(targetTest[i]) * log(probX) + (
                    1 - int(targetTest[i])) * log(1 - probX)
                if (math.fabs(probX - int(targetTest[i])) > 0.5):
                    totalOffByHalf = totalOffByHalf + 1

                if (int(targetTest[i]) == 1):
                    totalPositive = totalPositive + 1
                if (int(targetTest[i]) == 1 and probX < 0.5):
                    totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                if (probX > 0.5):
                    totalPositivePredictions = totalPositivePredictions + 1

            print "Total Off By > 0.5 ", totalOffByHalf
            print "Total Positive ", totalPositive
            print "Total Positive Off By Half ", totalPositiveOffByHalf
            print "Total Positive Predictions ", totalPositivePredictions
            print "NEstimators: ", n_est
            print -probSum / len(prob)

            avg += (-probSum / len(prob)) / NumFolds

            predicted_probs = rf.predict_proba(test)  # was test
            predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1]
            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean(1)

        print "------------------------Average: ", avg

        avg_list = np.zeros(len(test))
        med_list = np.zeros(len(test))

        # For N folds, get the average/median for each prediction item in test set.
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(predicted_list)):
                temp_list.append(predicted_list[q][p])

            avg_list[p] = mean(temp_list)
            med_list[p] = getMedian(temp_list)

            #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

        bootstrapLists.append(avg_list)

        ExecutionIndex = ExecutionIndex + 1

    # This would be used if we ran multiple runs with different training values.
    # Primitive stacking, should rather save data, and do formal stacking.
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            #print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    finalList = SimpleScale(finalList)
    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv",
                                     "PreProcessData/test_PatientGuid.csv",
                                     avg_values)

    #for rec in dataset_blend_train:
    #	print rec

    return dataset_blend_train, dataset_blend_test