Example #1
def Analyze1():

	Threshold = 4.0  
	targetFile = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv", skipFirstLine = False, split = "\t")
	shutil.copy2("PreProcessData/test_PreProcess3.csv", "PreProcessData/test_PreProcess8.csv")	
	shutil.copy2("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList8.csv")	
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)
	
	target = [x[0] for x in trainBase]
	
	
	print "Loading Data"
	trainNew = []
	
	probSum = 0.0
	weightSum = 0
	
	trn = csv_io.read_data("../predictions/" + targetFile, split="," ,skipFirstLine = False)
	for row, datum in enumerate(trn):

		if ( abs(datum[0] - target[row]) > Threshold):
			print datum[0], target[row]
			trainNew.append(trainBase[row])
			
			probSum += weights[row][0] * math.fabs(target[row] - datum[0])
			weightSum += weights[row][0]
		
		
	print "Train Score: ", (probSum/weightSum)	
	print len(trainNew)
	csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv", trainNew, delimiter="\t")	
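Every example in this listing leans on a project-local csv_io helper that is not reproduced here. The sketch below is only an assumption reconstructed from the call sites (read_data with skipFirstLine/split, write_delimited_file with delimiter/filemode/header); some projects' variants clearly differ, e.g. Examples #14 and #16 expect read_data to return a (rows, labels) pair.

# Hypothetical csv_io module, inferred from the call sites in these examples.
# The actual helper in the source projects may differ in signature and behavior.
def read_data(file_name, skipFirstLine=True, split=","):
    rows = []
    with open(file_name) as f:
        if skipFirstLine:
            f.readline()
        for line in f:
            items = line.strip().split(split)
            rows.append([float(item) for item in items])
    return rows

def write_delimited_file(file_name, rows, delimiter=",", filemode="w", header=None):
    with open(file_name, filemode) as f:
        if header is not None:
            f.write(delimiter.join(header) + "\n")
        for row in rows:
            if isinstance(row, (list, tuple)):
                f.write(delimiter.join(str(item) for item in row) + "\n")
            else:
                f.write(str(row) + "\n")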
Example #2
def main():
    #read in the training file
    train = csv_io.read_data("data/train.csv")
    target = ravel(csv_io.read_data("data/trainLabels.csv"))

    realtest = csv_io.read_data("data/test.csv")
    print len(realtest)

    # random forest code
    rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1, random_state=1, oob_score=True)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)

    # run model against test data
    predicted_probs = rf.predict_proba(realtest)
    predicted_class = rf.predict(realtest)
    print predicted_class[1:10]
    print(len(predicted_class))

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))

    csv_io.write_delimited_file("results/random_forest_solution.csv", predicted_class, header=['Id', 'Solution'])
Example #3
def main():
    #read in  data, parse into training and target sets
    train = csv.read_data("../Data/train.csv")
    target = np.array( [x[0] for x in train] )
    train = np.array( [x[1:280] for x in train] )

    #In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=120, min_samples_split=2, n_jobs=-1, max_depth=None) #.46
    #cfr = GradientBoostingClassifier(n_estimators=120, learn_rate=0.57, max_depth=1) #.50
    #cfr = ExtraTreesClassifier(n_estimators=120, max_depth=None, min_samples_split=1) #.489

    #Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    count = 0
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        result = logloss.llfun(target[testcv], [x[1] for x in probas])
        count += 1
        print('fold: %d, result: %f' % (count, result))
        results.append( result )

    #print out the mean of the cross-validated results
    print "Results: " + str( np.array(results).mean() )

    test = csv.read_data("../Data/test.csv")
    predicted_probs = cfr.predict_proba( [x[0:279] for x in test])
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv.write_delimited_file("../Submissions/rf_cv.csv",
                                predicted_probs)
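Example #3 scores each fold with logloss.llfun, another helper that is not shown in the listing. A plausible binary log-loss implementation, with the usual clipping so log(0) never occurs, is sketched below; the project's actual function may differ.

import numpy as np

# Plausible binary log loss: lower is better, probabilities clipped away from 0 and 1.
def llfun(act, pred):
    epsilon = 1e-15
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    act = np.asarray(act, dtype=float)
    return -np.mean(act * np.log(pred) + (1.0 - act) * np.log(1.0 - pred))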
Example #4
def main():
    #read in the training file
    train = csv_io.read_data("../data/train.csv")
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = csv_io.read_data("../data/test.csv")

    # random forest code
    rf = RandomForestClassifier(n_estimators=150,
                                min_samples_split=2,
                                n_jobs=-1)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)
    # run model against test data
    predicted_probs = rf.predict_proba(realtest)

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)

    print(
        'Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle'
    )
Example #5
def PreProcess3():

    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2_temp.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2_temp.csv", False)

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    NumFeatures = 200

    #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True)
    chi = chi2(train, target)
    print "Training"
    #clf.fit(train, target)

    chi = SelectKBest(chi2, k=NumFeatures).fit(train, target)
    print chi.get_support(indices=True)
    print chi.transform(train), np.array(train)[:, [0]]

    return  # NOTE: everything below is unreachable and relies on the classifier fit that is commented out above

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_
    #print importances
    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]
        #print "Sorted and deleted importances"
        #print importancesTemp

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    #print impIndex, len(importances)
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess2_chi.csv",
                                trainNew)
    csv_io.write_delimited_file("PreProcessData/test_PreProcess2_chi.csv",
                                testNew)
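Example #5 bails out at a debugging return before anything is written, and the unreachable code below it depends on a classifier that is commented out. For reference, a minimal sketch of the intended chi-squared feature selection (fit the selector on the training matrix, then apply the same column mask to train and test; chi2 requires non-negative feature values) would be:

from sklearn.feature_selection import SelectKBest, chi2

# Select the NumFeatures highest-scoring columns and apply the same mask to both sets.
selector = SelectKBest(chi2, k=NumFeatures).fit(train, target)
trainNew = selector.transform(train)
testNew = selector.transform(test)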
Example #6
File: blend.py Project: mb16/Kaggle
def Blend():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)
		
	SEED = 448
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	target = [x[0] for x in trainBase]
	
	dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)
	clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
		]
	
	
	# clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
			# LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]
	
	test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
	dataset_blend_test_set = np.zeros((len(test), len(clfs)))
	
	for ExecutionIndex, clf in enumerate(clfs):

		clf.fit(dataset_blend_train, target)
		submission = clf.predict(dataset_blend_test)
		
		submission = ["%f" % x for x in submission]
		now = datetime.datetime.now()
		csv_io.write_delimited_file("../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv", submission)	
		

		
		# attempt to score the training set to predict score for blend...
		probSum = 0.0
		weightSum = 0
		
		trainPrediction = clf.predict(dataset_blend_train)
		for i in range(0, len(trainPrediction)):
			probX = trainPrediction[i]
			

			probSum += weights[i][0] * math.fabs(target[i] - probX)
			weightSum += weights[i][0]
			#probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)
			 
		print "Train Score: ", (probSum/weightSum)
	
		dataset_blend_test_set[:, ExecutionIndex] = [float(x) for x in submission]  # submission was formatted to strings above
	
	
	
	csv_io.write_delimited_file_single("../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv", dataset_blend_test_set.mean(1))	
Example #7
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv",
                                 skipFirstLine=False,
                                 split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    SEED = 448
    #random.seed(SEED)
    #random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)
    clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]

    # clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
    # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)]

    test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    dataset_blend_test_set = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):

        clf.fit(dataset_blend_train, target)
        submission = clf.predict(dataset_blend_test)

        submission = ["%f" % x for x in submission]
        now = datetime.datetime.now()
        csv_io.write_delimited_file(
            "../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") +
            ".csv", submission)

        # attempt to score the training set to predict score for blend...
        probSum = 0.0
        weightSum = 0

        trainPrediction = clf.predict(dataset_blend_train)
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]

            probSum += weights[i][0] * math.fabs(target[i] - probX)
            weightSum += weights[i][0]
            #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX)

        print "Train Score: ", (probSum / weightSum)

        dataset_blend_test_set[:, ExecutionIndex] = [float(x) for x in submission]  # submission was formatted to strings above

    csv_io.write_delimited_file_single(
        "../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        dataset_blend_test_set.mean(1))
Example #8
def PreProcess3():
    filename = "stack201208301510"

    data = csv_io.read_data("../Submissions/" + filename + ".csv", False)
    data = SimpleScale(
        data, floor=0.05,
        ceiling=0.90)  # took 0.389 score and lowered it to 0.40, not good...

    csv_io.write_delimited_file(
        "../Submissions/" + filename + "_SimpleScale.csv", data)
Example #9
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    rf = RandomForestClassifier(n_estimators=100, min_split=2)
    rf.fit(train, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #10
def main():
    train = csv_io.read_data("../data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../data/test.csv")

    svc = svm.SVC(probability=True)
    svc.fit(train, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../submissions/svm_benchmark.csv",
                                predicted_probs)
Example #11
def main():
    train = csv_io.read_data("../Data/train.csv")
    targets = [int(x[0]) for x in train]
    num_targets = len(targets)
    num_ones = np.sum(targets)
    optimized_value = float(num_ones) / num_targets

    test = csv_io.read_data("../Data/test.csv")
    
    predicted_probs = ["%f" % optimized_value for x in test] 
    csv_io.write_delimited_file("../Submissions/optimized_value_benchmark.csv",
                                predicted_probs)
Example #12
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    rf = RandomForestClassifier(n_estimators=100, min_split=2)
    rf.fit(train, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #13
def main():
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    svc = svm.SVC(probability=True)
    svc.fit(train, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv",
                                predicted_probs)
Example #14
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    test, throwaway = csv_io.read_data("../Data/test.csv")
    
    n_test = len(test)
    n_target = len(set(target))

    predicted_probs = [[0.001 for x in range(n_target)] 
                       for y in range(n_test)]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/uniform_benchmark.csv",
                                predicted_probs)
Example #15
def PreProcessRun(dataSet):
    print
    print "DataSet: ", dataSet

    print "Loading Data"
    data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess.csv",
                            split="\t",
                            skipFirstLine=False)
    print dataSet, "Size: ", len(data[0])

    if (dataSet == "training"):  # do only once.
        shutil.copy2("PreProcessData/DataClassList.csv",
                     "PreProcessData/DataClassList1.csv")

    DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv",
                                     False)

    offset = 0
    offset2 = 0
    if (dataSet == "test"):
        offset = 1
        offset2 = -1

    print DataClassList

    print "Appending New Data"
    firstTime = True
    for row in data:

        text = ""

        val = row[136 + offset2] / row[139 + offset2]
        row.append(val)
        if (firstTime and dataSet == "training"):  # do only once.
            text = DataClassList[135 + offset][0] + "_DIV_" + DataClassList[
                139 + offset][0]
            csv_io.write_delimited_file("PreProcessData/DataClassList1.csv",
                                        [text],
                                        filemode="a")
        if (firstTime):
            print row[136 + offset2], row[139 + offset2], val, text

        firstTime = False

    csv_io.write_delimited_file("PreProcessData/" + dataSet +
                                "_PreProcess1.csv",
                                data,
                                delimiter="\t")

    print "Done."
Example #16
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    svc = svm.SVC(probability=True)
    svc.fit(training, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = [[min(max(x,0.001),0.999) for x in y] 
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv",
                                predicted_probs)
Example #17
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=100, min_split=2)
    rf.fit(training, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = [[min(max(x,0.001),0.999) for x in y] 
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #18
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=100, min_split=2)
    rf.fit(training, target)
    predicted_probs = rf.predict_proba(test)
    predicted_probs = [[min(max(x, 0.001), 0.999) for x in y]
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv",
                                predicted_probs)
Example #19
def main():
    np.random.seed(42)
    #read in the training file
    modelone = np.asarray(csv_io.read_data("results/gmm_pca12_6.csv",header=True))
    modeltwo = np.asarray(csv_io.read_data("results/random_forest_solution-12pca-4.csv",header=True))
    modelthree= np.asarray(csv_io.read_data("results/svm_pca12_5.csv",header=True))
    bagmodel = np.column_stack((modelone[:,1], modeltwo[:,1], modelthree[:,1]))
    bagsum = bagmodel.sum(axis=1)
    predicted_class = np.zeros(bagsum.shape)
    predicted_class[bagsum >=2] = 1
    
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))
    csv_io.write_delimited_file("results/bagging_solution_7.csv", predicted_class, header=['Id', 'Solution'])
Example #20
def main():
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]
    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    svc = svm.SVC(probability=True)
    svc.fit(training, target)
    predicted_probs = svc.predict_proba(test)
    predicted_probs = [[min(max(x, 0.001), 0.999) for x in y]
                       for y in predicted_probs]
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv",
                                predicted_probs)
Example #21
def main(strat = False, visualization = False):
    #read in the training file
    X = csv_io.read_data("data/train.csv")
    target = ravel(csv_io.read_data("data/trainLabels.csv"))
    realtest = csv_io.read_data("data/test.csv")
    print len(realtest)

    #pca
    pca = PCA(n_components=num_pca)
    pca.fit(X)
    train = pca.transform(X)
    test_transformed = pca.transform(realtest)
    print('performed pca')

    # support vector machine code
    clf = svm.SVC()
    if strat:
        print "stratified cross-validation on shuffled data"    
        # adapted from http://stackoverflow.com/a/8281241
        crossval = []
        for i in range(strat):
            X, y = shuffle(train, target, random_state=i)
            skf = StratifiedKFold(y, 10)
            crossval.append([min(cross_val_score(clf, X, y, cv=skf)), np.median(cross_val_score(clf, X, y, cv=skf)), max(cross_val_score(clf, X, y, cv=skf))]) 
        print crossval

    if visualization:
        print "preparing visualization"
        data_train, data_test, target_train, target_test = train_test_split(train, target, test_size=0.20, random_state=42)
        plot1 = drawLearningCurve(clf, data_train, target_train, data_test, target_test)
        pp = PdfPages('figures/learningCurve.pdf')
        pp.savefig(plot1)
        pp.close()

    print('fitting the model')
    clf.fit(train, target)
    # run model against test data
    predicted_class = clf.predict(test_transformed)
    print predicted_class[0:9]
    print(len(predicted_class))

    print('Writing output')
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))
    csv_io.write_delimited_file("results/svm_pca12_5.csv", predicted_class, header=['Id', 'Solution'])

    print ('Finished. Exiting.')
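A side note on Example #21: the stratified branch calls cross_val_score three times per shuffle just to take the min, median, and max of the same folds. A sketch that computes the fold scores once per shuffle gives the same summary while fitting each fold only once:

# Compute the per-fold scores once, then summarize.
scores = cross_val_score(clf, X, y, cv=skf)
crossval.append([scores.min(), np.median(scores), scores.max()])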
Example #22
def PreProcessRun(dataSet):
	print
	print "DataSet: ", dataSet
	
	print "Loading Data"
	data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess1.csv", split="\t" ,skipFirstLine = False)
	print dataSet, "Size: ", len(data[0])
	
	if ( os.path.exists("PreProcessData/" + dataSet + "_PreProcess2.csv") ):
		os.remove("PreProcessData/" + dataSet + "_PreProcess2.csv")
	
	SkipArr = [0,2,4,172]

	
	DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv", False)
	DataClassListNew = []
	
	firstTime = True
	for index, item in enumerate(data):
	
		rowNew = []
		#print item
		
		for index, val in enumerate(item):
			if dataSet == "training" and (index - 1) in SkipArr:
				continue
			elif dataSet == "test" and index in SkipArr:
				continue
			rowNew.append(val)
		
			#print val
			if dataSet == "test" and firstTime == True:
				print DataClassList[index]
				DataClassListNew.append(DataClassList[index])
				
		csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess2.csv", [copy.deepcopy(rowNew)], filemode="a", delimiter="\t")

		firstTime = False

	if dataSet == "test":
		csv_io.write_delimited_file("PreProcessData/DataClassList2.csv", DataClassListNew)

	
	print "Done."		
Example #23
File: stack.py Project: mb16/Kaggle
def main():

    et = csv_io.read_data("../Submissions/et_stack_avg_benchmark.csv", False)
    rbf = csv_io.read_data("../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv", False)
    poly = csv_io.read_data("../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv", False)
    rf = csv_io.read_data("../Submissions/rf2_avg_benchmark.csv", False)
    gb = csv_io.read_data("../Submissions/gb_avg_benchmark.csv", False)

    stack = []
    stack.append(et)
    stack.append(rbf)
    stack.append(poly)
    stack.append(rf)
    stack.append(gb)	
	
    spanDistance = 3
    finalList = []
    for p in range(0, len(stack[0])):
        temp_list =[]	
        for q in range(0, len(stack)):		
            temp_list.append(stack[q][p][0])

        avg = sum(temp_list)/float(len(stack))	

        if ( avg < 0.5 ):
            finalList.append(0.2) 
            #finalList.append(min(temp_list)) 
            print p, q, temp_list, avg, min(temp_list)
        else:		
            finalList.append(0.80) 
            #finalList.append(max(temp_list))
            print p, q, temp_list, avg, max(temp_list)
			
        #finalList.append( meanSpan(temp_list, spanDistance) )
        #print p, q, temp_list, meanSpan(temp_list, spanDistance)
  			
		
    finalStack = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/stack.csv", finalStack)	
	
	

    var = raw_input("Enter to terminate.")
Example #24
def main():

    et = csv_io.read_data("../Submissions/et_stack_avg_benchmark.csv", False)
    rbf = csv_io.read_data(
        "../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv", False)
    poly = csv_io.read_data(
        "../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv",
        False)
    rf = csv_io.read_data("../Submissions/rf2_avg_benchmark.csv", False)
    gb = csv_io.read_data("../Submissions/gb_avg_benchmark.csv", False)

    stack = []
    stack.append(et)
    stack.append(rbf)
    stack.append(poly)
    stack.append(rf)
    stack.append(gb)

    spanDistance = 3
    finalList = []
    for p in range(0, len(stack[0])):
        temp_list = []
        for q in range(0, len(stack)):
            temp_list.append(stack[q][p][0])

        avg = sum(temp_list) / float(len(stack))

        if (avg < 0.5):
            finalList.append(0.2)
            #finalList.append(min(temp_list))
            print p, q, temp_list, avg, min(temp_list)
        else:
            finalList.append(0.80)
            #finalList.append(max(temp_list))
            print p, q, temp_list, avg, max(temp_list)

        #finalList.append( meanSpan(temp_list, spanDistance) )
        #print p, q, temp_list, meanSpan(temp_list, spanDistance)

    finalStack = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/stack.csv", finalStack)

    var = raw_input("Enter to terminate.")
Example #25
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # code for logistic regression
    lr = LogisticRegression()
    lr.fit(train, target)
    predicted_probs = lr.predict_proba(realtest)
    
    # write solutions to file
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("log_solution.csv", predicted_probs)
    
    print ('Logistic Regression Complete! Submit log_solution.csv to Kaggle')
Example #26
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # code for logistic regression
    lr = LogisticRegression()
    lr.fit(train, target)
    predicted_probs = lr.predict_proba(realtest)
    
    # write solutions to file
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("log_solution.csv", predicted_probs)
    
    print ('Logistic Regression Complete! Submit log_solution.csv to Kaggle')
Example #27
def Analyze1():

    Threshold = 4.0
    targetFile = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    shutil.copy2("PreProcessData/test_PreProcess3.csv",
                 "PreProcessData/test_PreProcess8.csv")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList8.csv")
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    target = [x[0] for x in trainBase]

    print "Loading Data"
    trainNew = []

    probSum = 0.0
    weightSum = 0

    trn = csv_io.read_data("../predictions/" + targetFile,
                           split=",",
                           skipFirstLine=False)
    for row, datum in enumerate(trn):

        if (abs(datum[0] - target[row]) > Threshold):
            print datum[0], target[row]
            trainNew.append(trainBase[row])

            probSum += weights[row][0] * math.fabs(target[row] - datum[0])
            weightSum += weights[row][0]

    print "Train Score: ", (probSum / weightSum)
    print len(trainNew)
    csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv",
                                trainNew,
                                delimiter="\t")
Example #28
def main():
    # read in the training file
    train = csv_io.read_data("train.csv")
    # set the training responses
    target = [x[0] for x in train]
    # set the training features
    train = [x[1:] for x in train]
    # read in the test file
    realtest = csv_io.read_data("test.csv")

    # random forest code
    rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1)
    # fit the training data
    print("fitting the model")
    rf.fit(train, target)
    # run model against test data
    predicted_probs = rf.predict_proba(realtest)

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)

    print("Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle")
Example #29
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")
    
    #set the training responses
    target = [x[0] for x in train]
    
    #set the training features
    train = [[x[i] for i in (1, 3, 4, 5, 6)] for x in train]
    
    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # random forest code
    rf = RandomForestClassifier(n_estimators=10)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)
    # run model against test data
    predicted_probs = rf.predict_proba(realtest)
        
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)
Example #30
def main():
    #read in the training file
    train = csv_io.read_data("train.csv")

    #set the training responses
    target = [x[0] for x in train]

    #set the training features
    train = [[x[i] for i in (1, 3, 4, 5, 6)] for x in train]

    #read in the test file
    realtest = csv_io.read_data("test.csv")

    # random forest code
    rf = RandomForestClassifier(n_estimators=10)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)
    # run model against test data
    predicted_probs = rf.predict_proba(realtest)

    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)
Example #31
def main():

    startCol = 0
    endCol = 1775  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol + 1:endCol + 1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]

    train = [x[startCol + 1:endCol + 1] for x in train][1:3000]

    fo = open("knn_stats.txt", "a+")

    #n_neighbors=15, weights='distance' return 0.65
    #n_neighbors=3, weights='distance' 0.60
    rf = neighbors.KNeighborsClassifier(n_neighbors=3,
                                        weights='distance',
                                        algorithm='brute',
                                        leaf_size=100,
                                        warn_on_equidistant=True,
                                        p=2)  # 'distance'

    rf.fit(train, target)
    prob = rf.predict(trainTest)  # changed from test

    result = 100
    probSum = 0
    for i in range(0, len(prob)):
        probX = prob[i]  # [1]
        if (probX > 0.9):
            probX = 0.9
        if (probX < 0.1):
            probX = 0.1
        print i, probSum, probX, targetTest[i]
        print targetTest[i] * log(probX), (1 - targetTest[i]) * log(1 - probX)
        probSum += targetTest[i] * log(probX) + (
            1 - targetTest[i]) * log(1 - probX)

        #print probSum
        #print len(prob)
        #print "C: ", 10**C, " gamma: " ,2**g
        print -probSum / len(prob)

    if (-probSum / len(prob) < result):
        result = -probSum / len(prob)
        predicted_probs = rf.predict(test)  # was test
        predicted_probs = ["%f" % x for x in predicted_probs]
        csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
        print "Generated Data!!"

    #fo.write(str(5) + str(5)+ str(5));

    fo.close()

    #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs)

    #predicted_probs = rf.predict_proba(train) # changed from test

    #predicted_probs = ["%f" % x[1] for x in predicted_probs]
    #predicted_probs = rf.predict(train) # changed from test
    #predicted_probs = ["%f" % x for x in predicted_probs]

    #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs)

    var = raw_input("Enter to terminate.")
Example #32
def PreProcess4(N_Features):

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False,
                            split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = N_Features
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect.  Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    print "Scaling"
    term = 5000  #  scaler has memory errors between 5000 and 10000
    #term = len(trainBase)
    targetPre = [x[0] for x in trainBase][0:term]
    trainPre = [x[1:] for x in trainBase][0:term]
    #testPre = [x[0:] for x in test][0:term]
    targetPre = target[0:term]
    #print trainPre[term - 1]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)
    clf = GradientBoostingRegressor(loss='ls',
                                    learn_rate=0.05,
                                    subsample=0.5,
                                    max_depth=6,
                                    n_estimators=400,
                                    random_state=166,
                                    min_samples_leaf=30)

    print "Training"

    clf.fit(trainScaled, targetPre)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], importances[DataIndex]
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_" + str(NumFeatures) +
        ".csv", DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]

        print "Importance threshold: ", threshold

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess4_" +
                                str(NumFeatures) + ".csv",
                                trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess4_" +
                                str(NumFeatures) + ".csv",
                                testNew,
                                delimiter="\t")
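Example #32 (like several others in this listing) targets an old scikit-learn release: preprocessing.Scaler and the learn_rate keyword were later renamed. Under a current scikit-learn the same setup would look roughly like the sketch below; the tuned values are taken from the example, and the default squared-error loss corresponds to the old loss='ls'.

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

# Rough modern equivalent of the scaler and booster configured above.
scaler = StandardScaler().fit(trainPre)
trainScaled = scaler.transform(trainPre)
clf = GradientBoostingRegressor(learning_rate=0.05, subsample=0.5, max_depth=6,
                                n_estimators=400, random_state=166,
                                min_samples_leaf=30)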
Example #33
def run_stack(SEED, col, alpha):

    model = "Lasso"
    lossThreshold = 0.46

    trainBaseTarget = pd.read_csv('../preprocess/pre_shuffled_target_' + col +
                                  '.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + dset + '_train_' + col +
                                '.csv')
    testOrig = pd.read_csv('../models/' + model + dset + '_test_' + col +
                           '.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    #print(trainBase.columns)
    trainBaseID = trainBaseOrig['PIDN']
    testID = testOrig['PIDN']

    avg = 0
    NumFolds = 5

    # ----------------------

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if (filename[0:5] == "Stack" and "Lasso" in filename and
                float(parts[2]) < lossThreshold):  # and "Lasso" in filename

            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    # first col is PIDN, after that we have 'Ca','P','pH','SOC','Sand', so we need to add 1
    if col == 'Ca':
        targetCol = 1
    elif col == 'P':
        targetCol = 2
    elif col == 'pH':
        targetCol = 3
    elif col == 'SOC':
        targetCol = 4
    elif col == 'Sand':
        targetCol = 5

    print("Loading Data")
    for fileNum, file in enumerate(stackFiles):
        print(file)
        trn = csv_io.read_data(
            "../predictions/Target_" + file, split=",",
            skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[targetCol]

        tst = csv_io.read_data(
            "../predictions/" + file, split=",",
            skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[targetCol]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles)) + " " +
          "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    allVals = np.vstack((trainBase, test))

    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # should fit on the combined sets.

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    # --------------------------------

    clfs = [
        Lasso(alpha=alpha),
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    averageSet = []

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for train_index, test_index in Folds:

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]

            #print "LEN: ", len(train), len(target)

            target = np.array(np.reshape(target, (-1, 1)))

            targetTest = np.array(np.reshape(targetTest, (-1, 1)))

            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            #print(predicted[:,0])
            #print(predicted)
            dataset_blend_train[
                test_index,
                ExecutionIndex] = predicted  #[:,0] #needed for Ridge

            #print(targetTest.shape)
            #print(prpredictedob.shape)
            #print(weightTest.shape)

            print(str(math.sqrt(mean_squared_error(targetTest, predicted))))
            avg += math.sqrt(mean_squared_error(targetTest,
                                                predicted)) / NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  #[:,0]

            foldCount = foldCount + 1

            #break

        averageSet.extend([avg])

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))

        submission = pd.DataFrame(np.zeros((len(testID), 2)),
                                  columns=['PIDN', col])
        submission[col] = dataset_blend_test[:, ExecutionIndex]
        submission['PIDN'] = testID
        submission.to_csv("../submission/temp/Blend_" + dset + "_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + "_" + col + ".csv",
                          index=False)

        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)),
                                  columns=['PIDN', col])
        submission[col] = dataset_blend_train[:, ExecutionIndex]
        submission['PIDN'] = trainBaseID
        submission.to_csv("../submission/temp/Target_Blend_" + dset + "_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + "_" + col + ".csv",
                          index=False)

        csv_io.write_delimited_file("../log/partial/RunLogBlend.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "Blend", "Stacks", stackFiles
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Average: " + str(avg))

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test, averageSet, clfs, NumFolds, model
Example #34
def run_stack(SEED):

	model = "" 

	print "Running Stack."

	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant.

	targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
	
	trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")	
	#test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

	trainBase = trainBase[0:5000]
	targetX = targetX[0:5000]
	
	train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
	test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")
	

	predicted_list = []
	bootstrapLists = []


	clfs = [
		GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)		
	]		
	#GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)	
	#GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),	
	#GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),	
	

	
	print "Data size: ", len(trainBase) , 11573 # len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	#dataset_blend_test = np.zeros((len(test), len(clfs)))
	dataset_blend_test = np.zeros((11573, len(clfs)))  # shape must be a tuple; 11573 is the hard-coded test size
	
	#targetPre = target #[0:5000]
	#testScaled = test
	#trainScaled = trainBase #[0:5000]

	#targetPre = target #[0:5000]
	#testScaled = test
	#trainScaled = trainBase #[0:5000]
	
	
	print "Begin Training"

	lenTrainBase = len(trainBase)
	#lenTrainBase = len(trainBase[0:5000])


	lenTest = 11573
	#lenTest = len(test)

	
	
	gc.collect()
	
	for ExecutionIndex, clf in enumerate(clfs):
		print clf
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((lenTest, NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
			
		
		
		for train_index, test_index in Folds:

			target = [targetX[i] for i in train_index]
			train = [trainBase[i] for i in train_index]
			
			targetTest = [targetX[i] for i in test_index]	
			trainTest = [trainBase[i] for i in test_index]
			
			#target = [targetPre[i] for i in train_index]
			#train = [trainScaled[i] for i in train_index]
			
			#targetTest = [targetPre[i] for i in test_index]	
			#trainTest = [trainScaled[i] for i in test_index]	
	
			gc.collect()
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(target)
			
			#print train[0]
			#print target[0]
			#return
			
			print "Start", datetime.datetime.now()
			clf.fit(train, target)
			prob = clf.predict(trainTest) 
			print "End  ", datetime.datetime.now()
			
			dataset_blend_train[test_index, ExecutionIndex] = prob

			gc.collect()

	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]
				#probX = 31100.0  # debugging override; leave commented out so the real prediction is scored
				print targetTest[i][0], probX
				probSum += math.pow(math.log10(targetTest[i][0]) - math.log10(probX), 2)
				
				#probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				#weightSum += weights[test_index[i]][0] 
				
				
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", math.sqrt(probSum/len(prob))
 
			avg += 	math.sqrt(probSum/len(prob))/NumFolds

			gc.collect()
			
			fo = open("test1.csv", "r")			
			predicted_probs = []
			
			for line in fo:
				line = line.strip().split(",")
				newRow = []		
				for item in line:
					newRow.append(float(item))
					
				predicted_probs.append(clf.predict([newRow])[0])  # predict the single row and keep the scalar
				
			fo.close()
			
			#predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
			gc.collect()
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a",delimiter=",")
		
		
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
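The fold score in Example #34 is the root mean squared error of log10-transformed actuals and predictions, computed inline inside the loop. Pulled out as a standalone helper (assuming strictly positive values, since log10 is taken of both sides), it would look like:

import math

# RMSE of log10 values, as computed inline in Example #34.
def rmse_log10(actuals, predictions):
    total = 0.0
    for actual, pred in zip(actuals, predictions):
        total += math.pow(math.log10(actual) - math.log10(pred), 2)
    return math.sqrt(total / len(actuals))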
Example #35
def PreProcess3():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 split="\t",
                                 skipFirstLine=False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv",
                            split="\t",
                            skipFirstLine=False)
    weights = csv_io.read_data("PreProcessData/Weights.csv",
                               skipFirstLine=False)

    print "Train Size: ", len(trainBase[0]), "Test Size: ", len(test[0])

    shutil.copy2("PreProcessData/DataClassList2.csv",
                 "PreProcessData/DataClassList3.csv")

    lat = len(trainBase[0]) - 2
    long = len(trainBase[0]) - 1

    target = [x[0] for x in trainBase]
    train = [x[lat:long + 1] for x in trainBase]

    n_neighborsArr = [5]
    leaf_sizeArr = [30]
    for n_neighbor in n_neighborsArr:
        for leaf_s in leaf_sizeArr:

            print "Training neighbors: ", n_neighbor, "leaf_size: ", leaf_s

            neigh = KNeighborsRegressor(n_neighbors=n_neighbor,
                                        warn_on_equidistant=False,
                                        leaf_size=leaf_s,
                                        algorithm="ball_tree",
                                        weights=myFunc)
            neigh.fit(train, target)

            probSum = 0
            weightSum = 0

            for index, data in enumerate(trainBase):
                pred = neigh.predict([data[lat], data[long]])
                #print data[lat], data[long], "Prediction: ", pred[0], "Target: ", target[index]
                if (len(n_neighborsArr) == 1):
                    trainBase[index].append(pred[0])

                probSum += weights[index][0] * math.fabs(target[index] -
                                                         pred[0])
                weightSum += weights[index][0]

            print "Score: ", probSum / weightSum
            if (len(n_neighborsArr) > 1):
                continue

            for index, data in enumerate(test):
                pred = neigh.predict([data[lat - 1], data[long - 1]])
                #print data[lat - 1], data[long - 1], "Prediction: ", pred[0]
                if (len(n_neighborsArr) == 1):
                    test[index].append(pred[0])

    if (len(n_neighborsArr) > 1):
        return

    with open("PreProcessData/DataClassList3.csv", "a") as myfile:
        myfile.write("Lat-Long-Predictor\n")

    print "Writing Data"
    csv_io.write_delimited_file("PreProcessData/training_PreProcess3.csv",
                                trainBase,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess3.csv",
                                test,
                                delimiter="\t")
    print "Done."
Example #36
def main():

    trainBase = csv_io.read_data("PreProcessData/PreProcess2.csv", False)

    avg = 0
    NumFolds = 5  # should be odd for median

    predicted_list = []

    spanDistance = 12
    bootstrapLists = []

    NeighborsArray = [10]
    for Neighbors in NeighborsArray:

        predicted_list = []

        Folds = cross_validation.KFold(len(trainBase) - 1,
                                       k=NumFolds,
                                       indices=True,
                                       shuffle=False,
                                       random_state=None)
        for train_index, test_index in Folds:

            trainBaseTemp = [trainBase[i + 1] for i in train_index]
            #trainBaseTemp = trainBase
            target = [x[0] for x in trainBaseTemp]
            train = [x[1:] for x in trainBaseTemp]

            testBaseTemp = [trainBase[i + 1] for i in test_index]
            #testBaseTemp = trainBase
            targetTest = [x[0] for x in testBaseTemp]
            trainTest = [x[1:] for x in testBaseTemp]

            test = csv_io.read_data("PreProcessData/PreTestData2.csv", False)
            test = [x[0:] for x in test]

            kn = neighbors.KNeighborsClassifier(n_neighbors=Neighbors,
                                                weights='distance',
                                                algorithm='brute',
                                                leaf_size=100,
                                                warn_on_equidistant=True,
                                                p=2)

            kn.fit(train, target)
            prob = kn.predict_proba(trainTest)  # class probabilities are needed below (prob[i][1])

            prob = SimpleScale(prob)  # scale output probabilities

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i][1]  # [1]
                if (probX > 0.999):
                    probX = 0.999
                if (probX < 0.001):
                    probX = 0.001
                #print i, probSum, probX, targetTest[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += int(targetTest[i]) * log(probX) + (
                    1 - int(targetTest[i])) * log(1 - probX)
                if (math.fabs(probX - int(targetTest[i])) > 0.5):
                    totalOffByHalf = totalOffByHalf + 1

                if (int(targetTest[i]) == 1):
                    totalPositive = totalPositive + 1
                if (int(targetTest[i]) == 1 and probX < 0.5):
                    totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                if (probX > 0.5):
                    totalPositivePredictions = totalPositivePredictions + 1

            print "Total Off By > 0.5 ", totalOffByHalf
            print "Total Positive ", totalPositive
            print "Total Positive Off By Half ", totalPositiveOffByHalf
            print "Total Positive Predictions ", totalPositivePredictions
            print "Neighbors: ", Neighbors
            print -probSum / len(prob)

            avg += (-probSum / len(prob)) / NumFolds

            predicted_probs = kn.predict_proba(test)  # was test; probabilities so x[1] below is the positive class

            predicted_probs = SimpleScale(predicted_probs)  # scale output probabilities

            predicted_list.append([x[1] for x in predicted_probs])

        avg_list = []
        med_list = []

        # For N folds, get the average/median for each prediction item in test set.
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(predicted_list)):
                temp_list.append(predicted_list[q][p])

            avg_list.append(mean(temp_list))
            med_list.append(getMedian(temp_list))

            #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

        bootstrapLists.append(avg_list)

    # This would be used if we ran multiple runs with different training values.
    # Primitive stacking, should rather save data, and do formal stacking.
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/rf2_stack_avg.csv", avg_values)

    print "Average: ", avg

    var = raw_input("Enter to terminate.")
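The per-fold loop above gathers one prediction list per fold and then iterates over every test row to take the mean and median. A minimal NumPy sketch of the same aggregation (array contents are illustrative, not from the original data):

import numpy as np

# one row per fold, one column per test example
fold_preds = np.array([[0.2, 0.8, 0.4],
                       [0.3, 0.7, 0.5],
                       [0.1, 0.9, 0.6]])

avg_list = fold_preds.mean(axis=0)        # per-example mean across folds
med_list = np.median(fold_preds, axis=0)  # per-example median across folds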
Example #37
0
def PreProcessRun(dataSet):
	print
	print "DataSet: ", dataSet
	
	if ( os.path.exists("PreProcessData/" + dataSet + "_PreProcess.csv") ):
		os.remove("PreProcessData/" + dataSet + "_PreProcess.csv")
	
	
	DataClassList = []
	
	f1 = open("../" + dataSet + "/" + dataSet + "_filev1.csv", 'r')
	f2 = open("PreProcessData/" + dataSet + "_PreProcess_temp.csv", 'w')
	for line in f1:
		newLine = ""
		gotQuote = False
		for c in line:
			if ( c == "\"" and gotQuote == False ):
				gotQuote = True
			elif ( c == "\"" and gotQuote == True ):			
				gotQuote = False
				
			if ( gotQuote == True and  c == ","):	
				continue
			elif(gotQuote == True):
				newLine += c
			else:
				if ( c == ","):
					newLine += "\t"
				else:
					newLine += c
				
		
		f2.write(newLine)
	f1.close()
	f2.close()

	
	data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess_temp.csv", split="\t" ,skipFirstLine = False)

	weights = []
	first = True
	if ( dataSet == "training"):
		for row in data:
			if ( first  == True ) :
				first = False
				continue
			weights.append([row[13]])
			#print row[13]
		csv_io.write_delimited_file("PreProcessData/Weights.csv", weights)
	

	
	data = csv_io.read_data("PreProcessData/training_PreProcess_temp.csv", split="\t" ,skipFirstLine = True)	
	meanSum = [0.0] * 200
	meanCount = [0] * 200
	for index, val in enumerate(meanSum):
		meanCount[index] = 0
		meanSum[index] = 0.0
	
	for row in data:
		for index, val in enumerate(row):
			if ( isinstance(val, float) and val != 0.0):
				meanCount[index] += 1
				meanSum[index] += val
			#else:
				#print "skip: ", val
	
	for index, val in enumerate(meanSum):
		if meanCount[index] > 0:
			meanSum[index] = meanSum[index]/float(meanCount[index])
	

	data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess_temp.csv", split="\t" ,skipFirstLine = False)
	SkipArr = [0,2,4,171]
		
	for index, item in enumerate(data[0]):
		#print item
		if index in SkipArr:
			continue
		if "MOE_" in item:
			print "MOE_", item
			SkipArr.append(index)
			continue
		if ( index == 170 ):
			#DataClassList.insert(0, item)
			continue
		else:
			DataClassList.append(item)
			continue
	print "Len: ", len(data[0])

	first = True
	for item in data:
		#print item
		if ( first == True ):
			first = False
			continue
	
		rowNew = []

		for index, val in enumerate(item):
			if index in SkipArr:
				continue
			# in training this is the target value (prepend it to the row); in test this column is the weight, so just skip it
			if ( index == 170):
				#print "prepend", val
				if dataSet == "training":
					rowNew.insert(0, val)
				continue
			
		
			if ( val == "" or val == "NA" or val == "0" or val == "0.0" or val == 0 or val == 0.0):
				rowNew.append(meanSum[index]) 
			elif isinstance(val, str):
				rowNew.append(toFloat(val.replace("$", "")))	
			else:
				rowNew.append(val)
		
		csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess.csv", [copy.deepcopy(rowNew)], filemode="a", delimiter="\t")


	
	csv_io.write_delimited_file("PreProcessData/DataClassList.csv", DataClassList)

	
	print "Done."		
Example #38
0
def main():
    current = strftime("%Y%m%d", gmtime())
    trainfilename = os.path.join(os.path.dirname(__file__), 'data', 'train.csv')
    testfilename = os.path.join(os.path.dirname(__file__), 'data', 'test.csv')
    train_X = pd.read_csv(trainfilename)
    print "Basic info on training data"
    print len(train_X)
    print len(train_X.columns)
    print train_X.columns
    
    train_Y = train_X.take([1], axis=1)    
    # print train_X.columns
    orig_test_X = pd.read_csv(testfilename)
    test_X = orig_test_X
    
        
    #Binaries
    train_X["Sex"] = train_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1)
    test_X["Sex"] = test_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

    #Remove
    train_X = train_X.drop('Embarked',1)
    test_X = test_X.drop('Embarked',1)
    train_X = train_X.drop('Ticket',1)
    test_X = test_X.drop('Ticket',1)
    train_X["Name"] = train_X["Name"].apply(lambda name: nameparser(name))
    test_X["Name"] = test_X["Name"].apply(lambda name: nameparser(name))
    train_X = train_X.drop('Cabin',1)
    test_X = test_X.drop('Cabin',1)
    train_X["alone"] = train_X.apply(alone, axis=1)
    test_X["alone"] = test_X.apply(alone, axis=1)
    train_X = train_X.drop('SibSp',1)
    test_X = test_X.drop('SibSp',1)
    train_X = train_X.drop('Parch',1)
    test_X = test_X.drop('Parch',1)
    
    for name in train_X['Name'].unique():
        print "For name " + str(name)
        imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
        features = pd.concat([train_X[train_X['Name'] == name]['Age'],test_X[test_X['Name'] == name]['Age']]).values
        imp.fit(features)
        features = train_X[train_X['Name'] == name]['Age'].values        
        train_X.loc[train_X.Name == name,'Age'] = list(imp.transform(features)[0])        
        print np.std(train_X[train_X['Name'] == name]['Age'])
        print np.mean(train_X[train_X['Name'] == name]['Age'])
        
        features = test_X[test_X['Name'] == name]['Age'].values
        test_X.loc[test_X.Name == name,'Age'] = list(imp.transform(features)[0])
        print np.std(test_X[test_X['Name'] == name]['Age'])
        print np.mean(test_X[test_X['Name'] == name]['Age'])


    train_X["woman_child_man"] = train_X.apply(lambda row: woman_child_or_man(row), axis=1)
    test_X["woman_child_man"] = test_X.apply(lambda row: woman_child_or_man(row), axis=1)
    train_X = train_X.drop('Name',1)
    test_X = test_X.drop('Name',1)


    newcolumns= ["woman_child_man"]
    
    train_one_hot_X, test_one_hot_X = one_hot_dataframe(train_X, test_X, newcolumns, replace=True)
    train_X = train_one_hot_X
    test_X = test_one_hot_X

    
    print train_X.columns
    print test_X.columns
    train_X = train_X.drop('PassengerId',1)
    test_X = test_X.drop('PassengerId',1)
    
    # http://triangleinequality.wordpress.com/2013/05/19/machine-learning-with-python-first-steps-munging/
    #Age through Imputation performed already#
    
    # Fare #
    #Fare imputation may not help: see http://nbviewer.ipython.org/gist/mwaskom/8224591
    train_X.Fare = train_X.Fare.map(lambda x: np.nan if x==0 else x)
    test_X.Fare = test_X.Fare.map(lambda x: np.nan if x==0 else x)
    classmeans = pd.concat([train_X, test_X]).pivot_table('Fare', rows='Pclass', aggfunc='median')
    # classmeans = test_X.pivot_table('Fare', rows='Pclass', aggfunc='mean')
    train_X.Fare = train_X[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )
    test_X.Fare = test_X[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )

    
    train_X.to_csv(os.path.join(os.path.dirname(__file__), 'data', "simple_processed_train_data_est_{0}.csv".format(current)))
    test_X.to_csv(os.path.join(os.path.dirname(__file__), 'data', "simple_processed_test_data_est_{0}.csv".format(current)))
    train_X = train_X.drop('Survived',1)
    

    print "Finished reading data"
    
    train = train_X.values.astype(np.float)
    target = np.ravel(train_Y.values.astype(np.float))

    # random forest code
    forest = RandomForestClassifier(n_estimators = 100)
    forest = forest.fit(train, target)
    if True:
        from sklearn import cross_validation
        scores = cross_validation.cross_val_score(forest, train, target, cv=10)
        print scores
    
    if False:
        from sklearn.grid_search import GridSearchCV
        forest = ExtraTreesClassifier(bootstrap=True,oob_score=True,random_state=42)
        max_features_choices = [n * 0.1 for n in range(1,10)] 
        n_ests=[100, 200, 500, 1000]
        gs = GridSearchCV(forest, {'max_features': max_features_choices,'n_estimators': n_ests}, scoring = 'accuracy', cv = 10, n_jobs=-1)
        gs.fit(train, target)
        print "Score {0} with params {1}".format(gs.best_score_, gs.best_params_)

        print('fitting the model')
        forest = ExtraTreesClassifier(**gs.best_params_)
        forest.fit(train, target)

    
    # run model against test data
    predicted_class = forest.predict(test_X.values.astype(np.float))
    # print predicted_class[0:9]
    # print(len(predicted_class))

    predicted_class = ["%d,%d" % (orig_test_X.values[i,0], predicted_class[i]) for i in range(len(predicted_class))]
    
    csv_io.write_delimited_file(os.path.join(os.path.dirname(__file__), 'results', "simple_random_forest_solution_est_{0}.csv".format(current)), predicted_class, header=['PassengerId', 'Survived'])

    print ('Finished. Exiting.')
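The one_hot_dataframe helper called above is not shown in this listing; pandas can build the same kind of dummy columns directly with get_dummies. A minimal sketch, assuming train_X and test_X are DataFrames and the listed columns are categorical:

import pandas as pd

def one_hot(train_df, test_df, columns):
    # concatenate first so train and test end up with identical dummy columns
    combined = pd.concat([train_df, test_df], keys=["train", "test"])
    combined = pd.get_dummies(combined, columns=columns)
    return combined.loc["train"], combined.loc["test"]

# usage:
# train_X, test_X = one_hot(train_X, test_X, ["woman_child_man"])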
Example #39
0
File: ridge.py Project: mb16/Kaggle
def main():

    startCol = 0
    endCol = 50  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol+1:endCol+1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]
	
    train = [x[startCol+1:endCol+1] for x in train][1:3000]	
	
    fo = open("knn_stats.txt", "a+")

    rf = RidgeClassifier(alpha=0.01, fit_intercept=True, normalize=False, copy_X=True, tol=0.001) 
	
    rf.fit(train, target)
    prob = rf.predict(trainTest) # changed from test


    result = 100
    probSum = 0
    for i in range(0, len(prob)):
        probX = prob[i]  # [1]
        if ( probX > 0.7):
            probX = 0.7
        if ( probX < 0.3):
            probX = 0.3
        print i, probSum, probX, targetTest[i]
        print targetTest[i]*log(probX), (1-targetTest[i])*log(1-probX)
        probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)
	
        #print probSum	
        #print len(prob)	
        #print "C: ", 10**C, " gamma: " ,2**g
        print -probSum/len(prob)
	

	
    if ( -probSum/len(prob) < result ):
        result = -probSum/len(prob)
        predicted_probs = rf.predict(test)  # was test
        predicted_probs = ["%f" % x for x in predicted_probs]
        csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
        print "Generated Data!!"
		
    #fo.write(str(5) + str(5)+ str(5));
		
    fo.close()
		
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs)

    #predicted_probs = rf.predict_proba(train) # changed from test
 
    #predicted_probs = ["%f" % x[1] for x in predicted_probs]
    #predicted_probs = rf.predict(train) # changed from test
    #predicted_probs = ["%f" % x for x in predicted_probs]	
	
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs)
	
	
    var = raw_input("Enter to terminate.")								
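The clamping and running log-loss accumulation in the loop above (repeated in the KNN example that follows) can be collapsed into one vectorized helper; a minimal sketch, with the clipping bounds passed as parameters instead of hard-coded constants:

import numpy as np

def bounded_log_loss(y_true, y_pred, lo=0.001, hi=0.999):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), lo, hi)  # keep log() finite
    return -np.mean(y_true * np.log(y_pred) + (1.0 - y_true) * np.log(1.0 - y_pred))

# usage:
# print(bounded_log_loss(targetTest, prob, lo=0.3, hi=0.7))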
Example #40
0
File: knn.py Project: mb16/Kaggle
def main():

    startCol = 0
    endCol = 1775  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol+1:endCol+1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]
	
    train = [x[startCol+1:endCol+1] for x in train][1:3000]	
	
    fo = open("knn_stats.txt", "a+")

	
	#n_neighbors=15, weights='distance' return 0.65
	#n_neighbors=3, weights='distance' 0.60
    rf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='brute', leaf_size=100, warn_on_equidistant=True, p=2) # 'distance'
	
    rf.fit(train, target)
    prob = rf.predict(trainTest) # changed from test


    result = 100
    probSum = 0
    for i in range(0, len(prob)):
        probX = prob[i]  # [1]
        if ( probX > 0.9):
            probX = 0.9
        if ( probX < 0.1):
            probX = 0.1
        print i, probSum, probX, targetTest[i]
        print targetTest[i]*log(probX), (1-targetTest[i])*log(1-probX)
        probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)
	
        #print probSum	
        #print len(prob)	
        #print "C: ", 10**C, " gamma: " ,2**g
        print -probSum/len(prob)
	

	
    if ( -probSum/len(prob) < result ):
        result = -probSum/len(prob)
        predicted_probs = rf.predict(test)  # was test
        predicted_probs = ["%f" % x for x in predicted_probs]
        csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
        print "Generated Data!!"
		
    #fo.write(str(5) + str(5)+ str(5));
		
    fo.close()
		
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs)

    #predicted_probs = rf.predict_proba(train) # changed from test
 
    #predicted_probs = ["%f" % x[1] for x in predicted_probs]
    #predicted_probs = rf.predict(train) # changed from test
    #predicted_probs = ["%f" % x for x in predicted_probs]	
	
    #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs)
	
	
    var = raw_input("Enter to terminate.")								
Example #41
0
def main():
    
    trainfilename = os.path.join(os.path.dirname(__file__), 'data', 'train.csv')
    testfilename = os.path.join(os.path.dirname(__file__), 'data', 'test.csv')
    train_X = pd.read_csv(trainfilename)
    print "Basic info on training data"
    print len(train_X)
    print len(train_X.columns)
    print train_X.columns
    
    train_Y = train_X.take([1], axis=1)
    train_X = train_X.drop('Survived',1)
    
    # print train_X.columns
    test_X = pd.read_csv(testfilename)


    #Binaries
    train_X["has_family"] = train_X.apply(family, axis=1)
    train_X["child"] = train_X.apply(child, axis=1)
    train_X["smallchild"] = train_X.apply(smallchild, axis=1)
    train_X["familysize"] = train_X.apply(familysize, axis=1)
    train_X["Sex"] = train_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1)
        
    test_X["has_family"] = test_X.apply(family, axis=1)
    test_X["child"] = test_X.apply(child, axis=1)
    test_X["smallchild"] = test_X.apply(smallchild, axis=1)    
    test_X["familysize"] = test_X.apply(familysize, axis=1)
    test_X["Sex"] = test_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1)


    #Categorical ==> use one-hot encoding
    onehot = True
    if onehot:
        newcolumns= []
        train_X["Embarked"] = train_X["Embarked"].apply(lambda port: selectembarkment(port))
        test_X["Embarked"] = test_X["Embarked"].apply(lambda port: selectembarkment(port))
        newcolumns.append("Embarked")
        train_X["fare2"] = train_X.apply(fare2, axis=1)
        test_X["fare2"] = test_X.apply(fare2, axis=1)
        newcolumns.append("fare2")
        train_X["nameinfo"] = train_X["Name"].apply(lambda name: nameinfo(name))
        test_X["nameinfo"] = test_X["Name"].apply(lambda name: nameinfo(name))
        newcolumns.append("nameinfo")
        train_X["Ticket"] = train_X["Ticket"].apply(lambda ticket: DeptCode(ticket))
        test_X["Ticket"] = test_X["Ticket"].apply(lambda ticket: DeptCode(ticket))
        newcolumns.append("Ticket")
        train_X["Name"] = train_X["Name"].apply(lambda name: nameparser(name))
        test_X["Name"] = test_X["Name"].apply(lambda name: nameparser(name))
        newcolumns.append("Name")
        train_X["Cabin"] = train_X["Cabin"].apply(lambda cabin: cabinparser(cabin))
        test_X["Cabin"] = test_X["Cabin"].apply(lambda cabin: cabinparser(cabin))
        newcolumns.append("Cabin")
        train_one_hot_X, test_one_hot_X = one_hot_dataframe(train_X, test_X, newcolumns, replace=True)
        train_X = train_one_hot_X
        test_X = test_one_hot_X
    else:
        train_X["Embarked"] = train_X["Embarked"].apply(lambda port: selectembarkment(port))
        train_X["fare2"] = train_X["Fare"].apply(fare2, axis=1)
        train_X["nameinfo"] = train_X["Name"].apply(lambda name: nameinfo(name), axis=1)
        train_X["Ticket"] = train_X["Ticket"].apply(lambda ticket: DeptCode(ticket))
        train_X["Name"] = train_X["Name"].apply(lambda name: nameparser(name))
        train_X["Cabin"] = train_X["Cabin"].apply(lambda cabin: cabinparser(cabin))


        test_X["Embarked"] = test_X["Embarked"].apply(lambda port: selectembarkment(port))
        test_X["fare2"] = test_X["Fare"].apply(fare2, axis=1)
        test_X["nameinfo"] = test_X["Name"].apply(lambda name: nameinfo(name), axis=1)
        test_X["Ticket"] = test_X["Ticket"].apply(lambda ticket: DeptCode(ticket))    
        test_X["Name"] = test_X["Name"].apply(lambda name: nameparser(name))
        test_X["Cabin"] = test_X["Cabin"].apply(lambda cabin: cabinparser(cabin))


    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit_transform(train_X['Age'], test_X['Age'])
    
    print "Finished reading data"
    train_X = train_X.fillna(-1)
    test_X = test_X.fillna(-1)

    # print train_X
    if False:
        for column in train_X.columns:
            print column, train_X[column]
    
    
    train_X = train_X.values.astype(np.float)
    test_X = test_X.values.astype(np.float)
    target = np.ravel(train_Y.values.astype(np.float))

    trees = ExtraTreesClassifier(n_estimators=100,bootstrap=True,oob_score=True)
    trees.fit(train_X, target)
    pd.DataFrame(trees.feature_importances_).plot(kind='bar')
    selected_features = np.where(trees.feature_importances_ > 0.02)[0] #0.005)[0]
    #0.005
    #[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15]
    #0.01
    #[ 0  1  3  4  5  6  7  8  9 10 11 12 13 15]
    #0.02
    #[ 0  1  3  4  5  6  7  8  9 10 12 13 15]
    #0.05
    # [0 1 3 4 8]
    
    print selected_features
    
    train_selected_X = train_X[:, selected_features]
    test_selected_X = test_X[:, selected_features]

    # random forest code
    clf = svm.SVC()
    kernels = ['poly'] #, 'rbf', 'sigmoid']
    degs=[2,3]
    # gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
    gammas = [1e-3]#, 1e-3, 1e-1, 1.]
    gs = GridSearchCV(clf, {'kernel': kernels,'degree': degs, 'gamma': gammas}, scoring = 'accuracy', cv = 10)
    gs.fit(train_selected_X, target)
    print "Score {0} with params {1}".format(gs.best_score_, gs.best_params_)

    print('fitting the model')
    clf = svm.SVC(**gs.best_params_)
    clf.fit(train_selected_X, target)

    
    # run model against test data
    predicted_class = clf.predict(test_selected_X)
    # print predicted_class[0:9]
    # print(len(predicted_class))

    predicted_class = ["%d,%d" % (test_selected_X[i,0], predicted_class[i]) for i in range(len(predicted_class))]
    current = strftime("%Y%m%d", gmtime())
    
    csv_io.write_delimited_file(os.path.join(os.path.dirname(__file__), 'results', "svm_0.02_solution_est_{0}.csv".format(current)), predicted_class, header=['PassengerId', 'Survived'])

    #0.02
    #Trying sig, rbg, poly with degree 3 on 1e-3 and 1e-4
    # Score 0.760942760943 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 3} all kernels degree 3
    # real    408m13.291s
    #Score on kaggle 0.74163
    # all polynomial 1-4
    # Score 0.772166105499 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 2}
    # real 1077m38.691s
    #Score on kaggle 0.76555
    #0.05 for comparison
    # all polynomial 1-4
    # Score 0.763187429854 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 3}
    # real    340m42.558s
    # Your submission scored 0.74641,
    #Fixed features and implemented one hot coding
    # 0.02 polynomial 2 and 3 with 1e-3
    # real    283m12.476s
    # Score 0.772166105499 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 2}
    # Your submission scored 0.75598,
    print ('Finished. Exiting.')
Example #42
0
def PreProcess5():

    #note, 275 represents too much data, and the scaler fails with an exception.

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess5_250.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess5_250.csv",
                            skipFirstLine=False,
                            split="\t")
    #shutil.copy2("PreProcessData/DataClassList5.csv", "PreProcessData/DataClassList6.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data(
        "PreProcessData/DataClassList_Importances_250.csv", False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = 40
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect.  Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    print "Scaling"
    targetPre = [x[0] for x in trainBase][0:10000]
    print "Scaling1"
    trainPre = [x[1:] for x in trainBase][0:10000]
    #testPre = [x[0:] for x in test]
    print "Scaling2"
    scaler = preprocessing.Scaler().fit(trainPre)
    print "Scaling3"
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)

    #gc.collect()

    print "Prep Classes"

    # prep for usage below...
    DataClassListTemp = []
    for DataIndex, DataClass in enumerate(DataClassList):
        DataClassListTemp.append([DataClass[0], 0])

    DataClassList = DataClassListTemp

    reduceBy = 5
    totalFeatures = len(trainPre[0])

    trainNew = []
    testNew = []

    print "Processing"
    while (totalFeatures > NumFeatures):

        if (totalFeatures - NumFeatures < 40):
            reduceBy = 3
        if (totalFeatures - NumFeatures < 20):
            reduceBy = 2
        if (totalFeatures - NumFeatures < 10):
            reduceBy = 1

        if (totalFeatures - NumFeatures < reduceBy):
            reduceBy = totalFeatures - NumFeatures
            print "Reduce Features: ", reduceBy

        print "Training"
        clf = GradientBoostingRegressor(loss='ls',
                                        learn_rate=0.05,
                                        subsample=0.5,
                                        max_depth=6,
                                        n_estimators=400,
                                        random_state=166,
                                        min_samples_leaf=30)
        clf.fit(trainScaled, targetPre)

        print "Computing Importances"
        importances = clf.feature_importances_
        #print importances
        importancesSorted = sorted(importances, reverse=True)
        #print importancesSorted
        threshold = importancesSorted[len(importancesSorted) - reduceBy]
        print threshold
        #trainScaled = clf.transform(trainScaled, threshold) # only exists in RF

        trainScaledNew = []
        for row in trainScaled:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainScaledNew.append(newRow)

        trainScaled = trainScaledNew

        print "Cols:", len(trainScaled)
        print "Rows:", len(trainScaled[0])

        totalFeatures = totalFeatures - reduceBy
        print "Total Features:", totalFeatures

        trainNew = []
        testNew = []

        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)

        train = trainNew

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

        test = testNew

        print "Train Cols:", len(train)
        print "Train Rows:", len(train[0])

        print "Test Cols:", len(test)
        print "Test Rows:", len(test[0])

        DataClassListNew = []
        for Index, importance in enumerate(importances):
            if (importance > threshold):
                print DataClassList[Index][0], importance
                DataClassListNew.append([DataClassList[Index][0], importance])

        DataClassList = DataClassListNew

        print "Data Transform Complete"

    # final steps, save data classes in new set

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_RFE2_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_RFE2_sorted_" +
        str(NumFeatures) + ".csv", DataClassListNew_temp)

    # prepend the target on each row.
    trainFinal = []

    rowIndex = 0
    for row in train:
        newRow = []
        for Index, val in enumerate(row):
            if (Index == 0):
                newRow.append(target[rowIndex])
            newRow.append(val)
        trainFinal.append(newRow)
        rowIndex += 1

    csv_io.write_delimited_file("PreProcessData/training_PreProcess6_RFE2_" +
                                str(NumFeatures) + ".csv",
                                trainFinal,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess6_RFE2_" +
                                str(NumFeatures) + ".csv",
                                testNew,
                                delimiter="\t")
Example #43
0
def PreProcess2():


	trainBase = csv_io.read_data("PreProcessData/training_PreProcess2_temp.csv", False)
	test = csv_io.read_data("PreProcessData/test_PreProcess2_temp.csv", False)

	target = [x[0] for x in trainBase]
	train = [x[1:] for x in trainBase]
	
	IndexList = [2,3,4,5,6]
	
	
	with open("PreProcessData/DataClassList.csv", "a") as myfile:
		myfile.write("\n")
	
	
	
	DataClassList = csv_io.read_data("PreProcessData/DataClassList.csv", False)
	
	#for myIndex in IndexList:	
	for myIndex in range(2,75):
	#for myIndex in range(2,len(train[0]) - 2):
		
		
		MTrain = []
		FTrain = []
		MTarget = []
		FTarget = []
		
		for index, data in enumerate(train):
			if ( data[0] == "0" ):
				MTrain.append([data[1], data[myIndex]])
				MTarget.append(target[index])
				#print "M", data[1], data[myIndex]
			if ( data[0] == "1" ):
				FTrain.append([data[1], data[myIndex]])	
				FTarget.append(target[index])
				#print "F", data[1], data[myIndex]
				
		#print MTrain			
		print len(MTrain), len(FTrain),len(MTarget), len(FTarget)

		# better than GradBoost, and much better than KNN
		Mneigh = RandomForestClassifier()
		Fneigh = RandomForestClassifier()

		Mneigh.fit(MTrain, MTarget) 
		Fneigh.fit(FTrain, FTarget) 
		

		#count = 0
		for index, data in enumerate(train):
			if ( data[0] == "0" ):
				pred = Mneigh.predict_proba([data[1], data[myIndex]])
				#print "M", data[1], data[myIndex], pred[0][1], target[index]
				trainBase[index].append(pred[0][1])
				#if ( str(pred[0][1]) == str(target[index])):
				#	count = count + 1
			if ( data[0] == "1" ):
				pred = Fneigh.predict_proba([data[1], data[myIndex]])
				#print "F", data[1], data[myIndex], pred[0][1], target[index]
				trainBase[index].append(pred[0][1])
				#if ( str(pred[0][1]) == str(target[index])):
				#	count = count + 1


		
		for index, data in enumerate(test):
			if ( data[0] == "0" ):
				pred = Mneigh.predict_proba([data[1], data[myIndex]])
				#print "M", data[1], data[myIndex], pred[0][1], target[index]
				test[index].append(pred[0][1])
			if ( data[0] == "1" ):
				pred = Fneigh.predict_proba([data[1], data[myIndex]])
				#print "F", data[1], data[myIndex], pred[0][1], target[index]
				test[index].append(pred[0][1])	
		

		print myIndex, len(train[0])	
	
	
		with open("PreProcessData/DataClassList.csv", "a") as myfile:
			myfile.write("RF_Gender-Age-Class_" + str(DataClassList[myIndex][0]) + "_" + str(myIndex) + "\n")

	print "Writing Data"
	csv_io.write_delimited_file("PreProcessData/training_PreProcess2_temp_a.csv", trainBase)		
	csv_io.write_delimited_file("PreProcessData/test_PreProcess2_temp_a.csv", test)
	print "Done."	
Example #44
0
def main():

    #random.seed(5)
    #random.random()
	
    startCol = 0
    endCol = 1775  # max = 1775

    trainBase = csv_io.read_data("../Data/train.csv")
	
    result = 100
    avg = 0
    bootstraps = 5 # should be odd for median
	
    rnd_start = 456
	

    predicted_list = []
	
    spanDistance = 12
    bootstrapLists = []
	
    # this feature set got 0.45, which is no improvement over a single rf run.
    #for m_features in [52,56,60,66,72,80,90,100,110,120,130,140,150,160,170,180,190,200,220,240,260,280,300,350,400,450,500,550,600,650]:
    for m_features in [0]:
        
        predicted_list = []
        #bs = cross_validation.Bootstrap(len(trainBase) - 1, n_bootstraps=bootstraps, train_size=0.7, random_state=0)
        bs = cross_validation.KFold(len(trainBase) - 1, k=5, indices=True, shuffle=False, random_state=None)
        for train_index, test_index in bs:

            trainBaseTemp = [trainBase[i+1] for i in train_index]
            #trainBaseTemp = trainBase
            target = [x[0] for x in trainBaseTemp]#[1001:3700]
            train = [x[1:] for x in trainBaseTemp]#[1001:3700]
	
            testBaseTemp = [trainBase[i+1] for i in test_index]
            #testBaseTemp = trainBase
            targetTest = [x[0] for x in testBaseTemp]#[1:1000]
            trainTest = [x[1:] for x in testBaseTemp]#[1:1000]
	
	
            test = csv_io.read_data("../Data/test.csv")
            test = [x[0:] for x in test]
	
	
            fo = open("rf_stats.txt", "a+")
    
	
            #rf = RandomForestClassifier(n_estimators=200, min_density=0.2, criterion="gini", random_state=rnd_start, max_features=m_features) # , max_features=None
            rf = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=rnd_start, verbose=0) # , max_features=None

            rf.fit(train, target)
            prob = rf.predict_proba(trainTest)  # was test
	
            probSum = 0
            totalOffByHalf = 0
	
            for i in range(0, len(prob)):
                probX = prob[i][1] # [1]
                if ( probX > 0.999999999999):
                    probX = 0.999999999999
                if ( probX < 0.000000000001):
                    probX = 0.000000000001
                #print i, probSum, probX, target[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)
                if ( math.fabs(probX - targetTest[i]) > 0.5 ):
                    totalOffByHalf = totalOffByHalf + 1		
			
            print "Total Off By > 0.5 ", totalOffByHalf
            print "M-features: ", m_features
            print -probSum/len(prob)
	
            #fo.write(str(C) + "," + str(g) + "," + str(-probSum/len(prob)));
	
            avg += 	(-probSum/len(prob))/bootstraps

            predicted_probs = rf.predict_proba(test)  # was test
            predicted_list.append([x[1] for x in predicted_probs])
	
	
            fo.close()


        avg_list = []
        med_list = []
	
	
        for p in range(0, len(test)):
            temp_list =[]	
            for q in range(0, len(predicted_list)):		
                temp_list.append(predicted_list[q][p])
			
            avg_list.append( mean(temp_list) )
            med_list.append( getMedian(temp_list) )
		
            #print p, q, temp_list, mean(temp_list), getMedian(temp_list)
		
        bootstrapLists.append(avg_list)
		
    if ( len(bootstrapLists) > 1 ):
        finalList = []
        for p in range(0, len(test)):
            temp_list =[]	
            for q in range(0, len(bootstrapLists)):		
                temp_list.append(bootstrapLists[q][p])
			
            finalList.append( meanSpan(temp_list, spanDistance) )
		
            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]    	
		
		
    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/rf2_stack_avg_benchmark.csv", avg_values)	
	
	
    print "Average: ", avg
		
    var = raw_input("Enter to terminate.")								
Example #45
0
def run_rf(SEED):

    target = pd.read_csv('../data/pre_shuffled_target.csv')
    target = np.ravel(target.values)

    weights = pd.read_csv('../data/weights.csv')
    weights = np.ravel(weights.values)

    trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    test = pd.read_csv('../data/pre_shuffled_test.csv')

    NumFeatures = 30
    clf = RandomForestRegressor(n_estimators=30,
                                criterion='mse',
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='auto',
                                bootstrap=True,
                                oob_score=False,
                                n_jobs=1,
                                random_state=None,
                                verbose=0,
                                min_density=None,
                                compute_importances=True)
    #clf = ExtraTreesRegressor(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True,compute_importances=True)
    print("Training")
    clf.fit(trainBase.values, target, sample_weight=weights)

    print("Computing Importances")
    importances = clf.feature_importances_
    print(importances)

    importancesSorted = sorted(importances, reverse=True)
    print(str(len(importancesSorted)) + " importances")

    threshold = 1.0
    if (len(importancesSorted) > NumFeatures):
        threshold = importancesSorted[NumFeatures]

    print("Threshold: " + str(threshold))

    DataClassListNew = []
    print(trainBase.columns.values)
    for DataIndex, DataClass in enumerate(trainBase.columns.values):
        print(
            str(DataIndex) + " " + DataClass + ", " +
            str(importances[DataIndex]))
        DataClassListNew.append([DataClass, importances[DataIndex]])

        if (importances[DataIndex] < threshold and DataClass != "id"
                and DataClass != "var11"):  # don't drop id or weights column.
            trainBase.drop([DataClass], axis=1, inplace=True)
            test.drop([DataClass], axis=1, inplace=True)

    csv_io.write_delimited_file(
        "../preprocessdata/DataClassList_Importances_RF.csv", DataClassListNew)

    submission = pd.DataFrame(trainBase)
    submission.to_csv("../data/pre_rf_train.csv", index=False)

    submission = pd.DataFrame(test)
    submission.to_csv("../data/pre_rf_test.csv", index=False)
Example #46
0
def main():

	trainBase = csv_io.read_data("PreProcessData/PreProcess2.csv", False)
	
	avg = 0
	NumFolds = 5 # should be odd for median

	predicted_list = []
	
	spanDistance = 12
	bootstrapLists = []
	
	
	CgList = [[0.0, -5.5]]
	

	for Cg in CgList:
		
		predicted_list = []

		Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)
		for train_index, test_index in Folds:

			trainBaseTemp = [trainBase[i+1] for i in train_index]
			#trainBaseTemp = trainBase
			target = [x[0] for x in trainBaseTemp]
			train = [x[1:] for x in trainBaseTemp]
	
			testBaseTemp = [trainBase[i+1] for i in test_index]
			#testBaseTemp = trainBase
			targetTest = [x[0] for x in testBaseTemp]
			trainTest = [x[1:] for x in testBaseTemp]
	
	
			test = csv_io.read_data("PreProcessData/PreTestData2.csv", False)
			test = [x[0:] for x in test]
	

			svc = svm.SVC(probability=True, C=10**Cg[0], gamma=2**Cg[1], cache_size=800, coef0=0.0, degree=3, kernel='rbf', shrinking=True, tol=0.001)
			
			svc.fit(train, target)
			prob = svc.predict_proba(trainTest) 
	
			prob = SimpleScale(prob) # scale output probabilities
	
			probSum = 0
			totalOffByHalf = 0
			totalPositive = 0
			totalPositiveOffByHalf = 0
			totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i][1] # [1]
				if ( probX > 0.999):
					probX = 0.999
				if ( probX < 0.001):
					probX = 0.001
				#print i, probSum, probX, targetTest[i]
				#print target[i]*log(probX), (1-target[i])*log(1-probX)
				probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					totalOffByHalf = totalOffByHalf + 1		
			
				if ( int(targetTest[i]) == 1 ):
					totalPositive = totalPositive + 1
				if ( int(targetTest[i]) == 1 and probX < 0.5):
					totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				if (probX > 0.5):
					totalPositivePredictions = totalPositivePredictions + 1			
			
			print "Total Off By > 0.5 ", totalOffByHalf
			print "Total Positive ", totalPositive
			print "Total Positive Off By Half ", totalPositiveOffByHalf
			print "Total Positive Predictions ", totalPositivePredictions
			print "C/g: ", Cg[0], Cg[1]
			print -probSum/len(prob)
	
 
			avg += 	(-probSum/len(prob))/NumFolds

			predicted_probs = svc.predict_proba(test)  # was test
						
			predicted_probs = SimpleScale(predicted_probs) # scale output probabilities
						
			predicted_list.append([x[1] for x in predicted_probs])
				


		avg_list = []
		med_list = []
	
		# For N folds, get the average/median for each prediction item in test set.
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(predicted_list)):		
				temp_list.append(  predicted_list[q][p]) 
			
			avg_list.append( mean(temp_list) )
			med_list.append( getMedian(temp_list) )
		
			#print p, q, temp_list, mean(temp_list), getMedian(temp_list)
		
		bootstrapLists.append(avg_list)
		
	# This would be used if we ran multiple runs with different training values.
	# Primitive stacking, should rather save data, and do formal stacking.
	if ( len(bootstrapLists) > 1 ):
		finalList = []
		for p in range(0, len(test)):
			temp_list =[]	
			for q in range(0, len(bootstrapLists)):		
				temp_list.append(  bootstrapLists[q][p]) 
			
			finalList.append( meanSpan(temp_list, spanDistance) )
		
			print p, q, temp_list, meanSpan(temp_list, spanDistance)
	else:
		finalList = bootstrapLists[0]		
		
		
	avg_values = ["%f" % x for x in finalList]
	csv_io.write_delimited_file("../Submissions/rf2_stack_avg.csv", avg_values)	
	
	
	print "Average: ", avg
		
	var = raw_input("Enter to terminate.")								
Example #47
0
def run_stack(SEED):

    model = "Lasso"
    lossThreshold = 0.38

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBaseOrig['var11']
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    avg = 0
    NumFolds = 5

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if (filename[0:5] == "Stack" and float(parts[2]) > lossThreshold):

            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, file in enumerate(stackFiles):
        print(file)
        trn = csv_io.read_data(
            "../predictions/Target_" + file, split=",",
            skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[1]  # header line already skipped by read_data

        tst = csv_io.read_data(
            "../predictions/" + file, split=",",
            skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles)) + " " +
          "Threshold: " + str(lossThreshold))

    print("Starting Scale")

    allVals = np.vstack((trainBase, test))

    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # should fit on the combined sets.

    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    clfs = [
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
        Lasso(alpha=0.000016681005372000593),
        #Ridge(),
        #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for train_index, test_index in Folds:

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            #print "LEN: ", len(train), len(target)

            target = np.array(np.reshape(target, (-1, 1)))
            #train = np.array(np.reshape(train, (-1, 1))  )
            weight = np.array(np.reshape(weight, (-1, 1)))

            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            #print(predicted[:,0])
            print(predicted)
            dataset_blend_train[
                test_index,
                ExecutionIndex] = predicted  #[:,0] #needed for Ridge

            #print(targetTest.shape)
            #print(predicted.shape)
            #print(weightTest.shape)

            print(
                str(
                    score.normalized_weighted_gini(targetTest.ravel(),
                                                   predicted.ravel(),
                                                   weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(),
                weightTest.ravel()) / NumFolds
            #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())))
            #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  #[:,0]

            foldCount = foldCount + 1

            #break

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))

        submission = pd.DataFrame(np.zeros((len(testID), 2)),
                                  columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../submission/Blend_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + ".csv",
                          index=False)

        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)),
                                  columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../submission/Target_Blend_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + ".csv",
                          index=False)

        csv_io.write_delimited_file("../log/RunLogBlend.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Average: " + str(avg))

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
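The heart of run_stack is out-of-fold prediction: within each fold the model predicts only on the rows it did not train on, so dataset_blend_train never contains in-sample predictions, while the test-set predictions are averaged over the folds. A minimal sketch of that pattern for one regressor, using the current sklearn KFold interface (names and the Lasso alpha are illustrative):

import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso

def out_of_fold(clf, X, y, X_test, n_folds=5):
    oof = np.zeros(len(X))
    test_preds = np.zeros((len(X_test), n_folds))
    for i, (tr, te) in enumerate(KFold(n_splits=n_folds).split(X)):
        clf.fit(X[tr], y[tr])
        oof[te] = clf.predict(X[te])           # held-out rows only
        test_preds[:, i] = clf.predict(X_test)
    return oof, test_preds.mean(axis=1)        # average test predictions across folds

# usage:
# oof_train, blend_test = out_of_fold(Lasso(alpha=1.7e-5), X, y, X_test)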
Example #48
0
def preprocess():

	train, test = util.get_train_test_df()

	
	columns = set(train.columns)
	#columns.remove("SalesID")
	#columns.remove("SalePrice")
	#columns.remove("saledate")

	#train_fea = get_date_dataframe(train["saledate"])
	#test_fea = get_date_dataframe(test["saledate"])

	#parseColumns = ["UsageBand"]
	parseColumns = [ "UsageBand","fiBaseModel","fiModelSeries","fiModelDescriptor","ProductSize","ProductGroup","Drive_System","Enclosure","Forks","Pad_Type","Ride_Control","Stick","Transmission","Turbocharged","Blade_Extension","Blade_Width","Enclosure_Type","Engine_Horsepower","Hydraulics","Pushblock","Ripper","Scarifier","Tip_ControlCoupler","Coupler_System","Grouser_Tracks","Hydraulics_Flow","Track_Type","Thumb","Pattern_Changer","Grouser_Type","Backhoe_Mounting","Blade_Type","Travel_Controls","Differential_Type","Steering_Controls"]
	
	#"auctioneerID","state","ProductGroupDesc",,"fiSecondaryDesc"
	# "fiModelDesc" is redundant and has too many options...
	
	# Q, AC, AL AR AS
	
	colDict = {}
	for col in parseColumns:
		colDict[col] = []
		
	colMap = {}	
	notInTest = []
	for index, col in enumerate(train.columns):
		print "MAP:", col, index
		colMap[col] = index
		if col in parseColumns:
			#print "start"			
			s = set(x for x in train[col].fillna(0)) # 0 if x == "" or not isinstance(x, float) else x
			s.update(x for x in test[col].fillna(0)) # math.isnan(x)
			
			colDict[col] = s
			print s
			
			if col == "fiBaseModel":
				a = set(x for x in train[col].fillna(0))
				b = set(x for x in test[col].fillna(0))		
				print "fiBaseModel"
				print
				print
				# found 11 type in test not in train
				print [x for x in b if x not in a]
				print
				print
				# found several hundred in train that are not in test, try dropping these...
				print [x for x in a if x not in b]
				notInTest = [x for x in a if x not in b]

				
	SaleIDArr = []		
	trainSalePriceArr = []

	count = 0
	csv_io.delete_file("train1.csv")
	for row in train.iterrows():
		trainSalePrice = []
	
		rowVals = row[1].fillna(0)
		newSet = []
		newRow = []
		
		if rowVals["fiBaseModel"] not in notInTest:
			continue
		
		trainSalePrice.append(rowVals["SalePrice"])
		trainSalePriceArr.append(trainSalePrice)
		
		SaleID = []
		SaleID.append(rowVals["SalesID"])
		SaleIDArr.append(SaleID)
		
		for col in colDict.keys():
			for val in colDict[col]:
				if val == rowVals[col] :
					newRow.append(1)
				else:
					newRow.append(0)

		#newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year
		newRow.append(rowVals["MachineHoursCurrentMeter"])
		
		count += 1
		if count % 10000 == 0:
			print "Count", count
			
		newSet.append(newRow)
		csv_io.write_delimited_file("train1.csv", newSet ,header=None, delimiter=",", filemode="a")

		
	csv_io.write_delimited_file("target.csv", trainSalePriceArr ,header=None, delimiter=",")
	csv_io.write_delimited_file("train_salesID.csv", SaleIDArr ,header=None, delimiter=",")		
	# -------------------------------------------	
	
	SaleIDArr = []
	
	count = 0
	csv_io.delete_file("test1.csv")
	for row in test.iterrows():

		rowVals = row[1].fillna(0)
		newSet = []
		newRow = []
		
		SaleID = []
		SaleID.append(rowVals["SalesID"])
		SaleIDArr.append(SaleID)
		
		for col in colDict.keys():
			for val in colDict[col]:
				if val == rowVals[col] :
					newRow.append(1)
				else:
					newRow.append(0)

		#newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year
		newRow.append(rowVals["MachineHoursCurrentMeter"])
		
		count += 1
		if count % 10000 == 0:
			print "Count", count
			
		newSet.append(newRow)
		csv_io.write_delimited_file("test1.csv", newSet ,header=None, delimiter=",", filemode="a")
	
	csv_io.write_delimited_file("test_salesID.csv", SaleIDArr ,header=None, delimiter=",")		
	


if __name__=="__main__":
	preprocess()
Example #49
0
def run_stack(SEED):

	model = "Long-Lat KNN5 - 50 Features"

	print "Running GB, RF, ET stack."

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)

	
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. (note, predictions are less reliable when using 10).


	predicted_list = []
	bootstrapLists = []

	# use this for quick runs.
	# note RF with 150 crashes on 30 features
	# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	# GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	# RandomForestRegressor(n_estimators=100, n_jobs=1),
	#RandomForestRegressor(n_estimators=75, n_jobs=1),
	# clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
		# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
		# ]	
	#knn 5 at 3.45
	#knn 15 at 3.31
	#knn 25 at 3.30
	#knn 40 at 3.31
	# KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# LinearRegression at 3.77
	# Ridge at 3.77
	# SGD 4.23
	#Gauss at 13
	# LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
	# Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
	# SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
	# GaussianNB()
	# clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
		 # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)	
		# ]
		
	# GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
	# clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
			# ]	
			
	# about 1 hour run time, and 3.10 score.		
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.05
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
	# about 4 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)	
	
	clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166)
			]		
	
	
		# use this for quick runs.
	# clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]	
	
	
	
	# use this for quick runs.  reduced estimators to 50
	# clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
        # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
        # tol=0.001, verbose=False)
			# ]	
			
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]
			
			
	# full algorithm stack.
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
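	# hedged note: dataset_blend_train collects the out-of-fold predictions (one
	# column per classifier) and dataset_blend_test the fold-averaged test-set
	# predictions, so both can later feed a second-level stacking model.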
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	targetPre = [x[0] for x in trainBase]
	trainPre = [x[1:] for x in trainBase]
	testPre = [x[0:] for x in test]
	#print trainPre[0]
	scaler = preprocessing.Scaler().fit(trainPre)
	trainScaled = scaler.transform(trainPre)
	testScaled = scaler.transform(testPre)	
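	# hedged note: preprocessing.Scaler is the older name of StandardScaler; it is
	# fit on the training rows only and then applied to both train and test.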

	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	
	for ExecutionIndex, clf in enumerate(clfs):
		print str(clf)
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((len(test), NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
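		# a hedged sketch (not used here): for classification targets one could
		# stratify the folds on the label column with the same-era API, e.g.
		# labels = [trainBase[i][0] for i in range(len(trainBase))]
		# Folds = cross_validation.StratifiedKFold(labels, k=NumFolds, indices=True)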
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainScaled[i] for i in train_index]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainScaled[i] for i in test_index]	
	
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(target)
			
			clf.fit(train, target)
			prob = clf.predict(trainTest) 
			
			dataset_blend_train[test_index, ExecutionIndex] = prob



	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]

				probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				weightSum += weights[test_index[i]][0] 
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", probSum/weightSum
 
			avg += 	(probSum/weightSum)/NumFolds
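			# a hedged, vectorized equivalent of the weighted-MAE loop above
			# (assuming numpy arrays in the same fold order):
			# w = np.array([weights[i][0] for i in test_index])
			# wmae = np.sum(w * np.abs(np.array(targetTest) - prob)) / np.sum(w)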

			predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
		
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions_40/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
		print now
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
Example #50
0
def run_stack(SEED):

    train, test = util.get_train_test_df()

    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])

    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            s = set(x for x in train[col])
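            # hedged note: the defaultdict below maps each distinct string to its
            # enumeration index and returns -1 for values unseen when it was built.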
            str_to_categorical = defaultdict(lambda: -1,
                                             [(x[1], x[0])
                                              for x in enumerate(s)])
            train_fea = train_fea.join(
                pd.DataFrame(
                    {col: [str_to_categorical[x] for x in train[col]]},
                    index=train.index))
            test_fea = test_fea.join(
                pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]},
                             index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers have less influence.

    #targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
    #trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    #trainBase = trainBase[0:5000]
    #targetX = targetX[0:5000]

    #train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
    #test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")

    predicted_list = []
    bootstrapLists = []

    clfs = [
        GradientBoostingRegressor(loss='lad',
                                  learn_rate=0.05,
                                  subsample=0.5,
                                  max_depth=6,
                                  n_estimators=3000,
                                  random_state=166,
                                  min_samples_leaf=1)
    ]
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    #train_fea, train["SalePrice"]
    print "Data size: ", len(train_fea), len(test_fea)
    #dataset_blend_train = np.zeros((len(train_fea), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros(
        (len(test_fea), len(clfs)))  # np.zeros(len(train_fea), len(clfs))
    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))

    print "Begin Training"

    lenTrainBase = 401125  # len(train_fea)

    lenTest = 11573  # len(test_fea)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for train_index, test_index in Folds:

            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]

            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)

            #print trainX[0]
            #print target[0]
            #return

            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End  ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]
                #print targetTest[i], probX

                if probX < 0:  # some predictions are coming out negative.
                    probX = -probX

                probSum += math.pow(
                    math.log10(targetTest[i]) - math.log10(probX), 2)

                #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                #weightSum += weights[test_index[i]][0]

                #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                # totalOffByHalf = totalOffByHalf + 1

                # if ( int(targetTest[i]) == 1 ):
                # totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                # totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                # totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions
            #print -probSum/len(prob)
            print "Score: ", math.sqrt(probSum / len(prob))

            avg += math.sqrt(probSum / len(prob)) / NumFolds
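            # a hedged, vectorized form of the base-10 RMSLE computed above
            # (assuming prob and targetTest are positive arrays):
            # err = np.log10(np.asarray(targetTest)) - np.log10(np.abs(np.asarray(prob)))
            # print "Score: ", math.sqrt(np.mean(err ** 2))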

            gc.collect()

            predicted_probs = []

            for i in range(0, lenTest):
                predicted_probs.append(clf.predict(test_fea.ix[i]))
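            # hedged note: predicting row by row keeps peak memory low; a single
            # clf.predict(test_fea) call would normally be faster when memory allows.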

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]
            gc.collect()

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
            str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_test_set.mean(1))

        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
            "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])

        csv_io.write_delimited_file("../predictions/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", model, "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print "------------------------Average: ", avg

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
Example #51
0
def preprocess():

	train, test = util.get_train_test_df()

	
	columns = set(train.columns)
	#columns.remove("SalesID")
	#columns.remove("SalePrice")
	#columns.remove("saledate")

	#train_fea = get_date_dataframe(train["saledate"])
	#test_fea = get_date_dataframe(test["saledate"])

	#parseColumns = ["UsageBand"]
	parseColumns = [ "UsageBand","fiBaseModel","fiModelSeries","fiModelDescriptor","ProductSize","ProductGroup","Drive_System","Enclosure","Forks","Pad_Type","Ride_Control","Stick","Transmission","Turbocharged","Blade_Extension","Blade_Width","Enclosure_Type","Engine_Horsepower","Hydraulics","Pushblock","Ripper","Scarifier","Tip_ControlCoupler","Coupler_System","Grouser_Tracks","Hydraulics_Flow","Track_Type","Thumb","Pattern_Changer","Grouser_Type","Backhoe_Mounting","Blade_Type","Travel_Controls","Differential_Type","Steering_Controls"]
	
	#"auctioneerID","state","ProductGroupDesc",,"fiSecondaryDesc"
	# this is redundant "fiModelDesc", and has too many options...
	
	# Q, AC, AL AR AS
	
	colDict = {}
	for col in parseColumns:
		colDict[col] = []
		
	colMap = {}	
	notInTest = []
	for index, col in enumerate(train.columns):
		print "MAP:", col, index
		colMap[col] = index
		if col in parseColumns:
			#print "start"			
			s = set(x for x in train[col].fillna(0)) # 0 if x == "" or not isinstance(x, float) else x
			s.update(x for x in test[col].fillna(0)) # math.isnan(x)
			
			colDict[col] = s
			print s
			
			if col == "fiBaseModel":
				a = set(x for x in train[col].fillna(0))
				b = set(x for x in test[col].fillna(0))		
				print "fiBaseModel"
				print
				print
				# found 11 types in test not in train
				print [x for x in b if x not in a]
				print
				print
				# found several hundred in train that are not in test, try dropping these...
				print [x for x in a if x not in b]
				notInTest = [x for x in a if x not in b]

				
	SaleIDArr = []		
	trainSalePriceArr = []

	count = 0
	csv_io.delete_file("train1.csv")
	for row in train.iterrows():
		trainSalePrice = []
	
		rowVals = row[1].fillna(0)
		newSet = []
		newRow = []
		
		if rowVals["fiBaseModel"] not in notInTest:
			continue
		
		trainSalePrice.append(rowVals["SalePrice"])
		trainSalePriceArr.append(trainSalePrice)
		
		SaleID = []
		SaleID.append(rowVals["SalesID"])
		SaleIDArr.append(SaleID)
		
		for col in colDict.keys():
			for val in colDict[col]:
				if val == rowVals[col] :
					newRow.append(1)
				else:
					newRow.append(0)
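		# hedged note: the nested loops above hand-roll a one-hot encoding of the
		# columns gathered in colDict (1 where the row value equals the category, else 0).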

		#newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year
		newRow.append(rowVals["MachineHoursCurrentMeter"])
		
		count += 1
		if count % 10000 == 0:
			print "Count", count
			
		newSet.append(newRow)
		csv_io.write_delimited_file("train1.csv", newSet ,header=None, delimiter=",", filemode="a")

		
	csv_io.write_delimited_file("target.csv", trainSalePriceArr ,header=None, delimiter=",")
	csv_io.write_delimited_file("train_salesID.csv", SaleIDArr ,header=None, delimiter=",")		
	# -------------------------------------------	
	
	SaleIDArr = []
	
	count = 0
	csv_io.delete_file("test1.csv")
	for row in test.iterrows():

		rowVals = row[1].fillna(0)
		newSet = []
		newRow = []
		
		SaleID = []
		SaleID.append(rowVals["SalesID"])
		SaleIDArr.append(SaleID)
		
		for col in colDict.keys():
			for val in colDict[col]:
				if val == rowVals[col] :
					newRow.append(1)
				else:
					newRow.append(0)

		#newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year
		newRow.append(rowVals["MachineHoursCurrentMeter"])
		
		count += 1
		if count % 10000 == 0:
			print "Count", count
			
		newSet.append(newRow)
		csv_io.write_delimited_file("test1.csv", newSet ,header=None, delimiter=",", filemode="a")
	
	csv_io.write_delimited_file("test_salesID.csv", SaleIDArr ,header=None, delimiter=",")		
Example #52
0
		print "Score: ", auc
			

		avg += 	auc/NumFolds

		predicted_probs = clf.predict_proba(finalTestSparse) 	
		#predicted_list.append([x[1] for x in predicted_probs])	
		dataset_blend_test_set[:, foldCount] = predicted_probs[:,1]
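		# hedged note: predict_proba returns one column per class; [:,1] keeps the
		# positive-class probability for this fold's column of the blend matrix.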
		
				
		foldCount = foldCount + 1
		
		#break	
		
	dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		

		
	now = datetime.datetime.now()

	csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
	csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
	csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
	print "------------------------Average: ", avg




Example #53
0
def run_stack(SEED):

    model = "Lasso"


    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBase['var11']
    test = pd.read_csv('../models/' + model + '_test.csv')


    #trainBase = shuffle(trainBase, random_state = SEED)

    print(trainBase.columns)
    trainBaseID = trainBase['id']
    testID = test['id']    

    
    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    test = np.nan_to_num(np.array(test))
    
    
    avg = 0
    NumFolds = 5



    #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),    
        #Ridge()
    clfs = [
        LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
        #BaggingRegressor(base_estimator=Ridge(), n_estimators=500, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1, random_state=None, verbose=0)
        #AdaBoostRegressor(base_estimator=Ridge(), n_estimators=50, learning_rate=1.0, loss='linear', random_state=None)
        #Lasso(alpha=0.0000329034456231),
        #Ridge(),
        #RandomForestRegressor(n_estimators=3000, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=None),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=300, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=1000, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=3000, random_state=166, min_samples_leaf=1),
    ]        
    
    
    
    print ("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))
    


    
    print("Begin Training")
    
    lenTrainBase = len(trainBase)
    lenTest = len(test)
    
    

    gc.collect()
    
    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0
    

            
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        
        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
            
        for train_index, test_index in Folds:
    
            print()
            print ("Iteration: " + str(foldCount))
            
            
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))    
    
    
            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]
            
            targetTest = [targetBase[i] for i in test_index]    
            trainTest = [trainBase[i] for i in test_index]    
            weightTest = [trainBaseWeight[i] for i in test_index]
            

            #print "LEN: ", len(train), len(target)
            
            
            target = np.array(np.reshape(target, (-1, 1)) )           
            #train = np.array(np.reshape(train, (-1, 1))  ) 
            weight = np.array(np.reshape(weight, (-1, 1)))              
    
            targetTest = np.array(np.reshape(targetTest, (-1, 1)) )  
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )  
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))              
            

            #clf.fit(train, target, sample_weight=weight)
            clf.fit(train, target)
            predicted = clf.predict(trainTest) 
            #print(predicted[:,0])
            print(test_index)
            dataset_blend_train[test_index, ExecutionIndex] = predicted[:,0] #needed for Ridge

     
            #print(targetTest.shape)
            #print(predicted.shape)
            #print(weightTest.shape)

            print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds

            predicted[predicted[:,0] < 0.0] = 0.0        

            print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds
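            # hedged note: avg accumulates both the raw and the clipped gini terms
            # above, so the logged value is the sum of those two per-fold averages.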
       
                 
            predicted = clf.predict(test)         
            dataset_blend_test_set[:, foldCount] = predicted[:,0] 
        

        
                
            foldCount = foldCount + 1
        
   
        
        
        dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
        
    
        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1) 
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        
        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:,ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index = False)
        
        
        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )        
        
        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:,ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index = False)
        
        
        csv_io.write_delimited_file("../log/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", "", "", ""], filemode="a",delimiter=",")
        
        
        print ("------------------------Average: " + str(avg))

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
Example #54
0
def run_stack(SEED):

    model = "base"

    trainBase = csv_io_np.read_data("PreProcessData/train.csv",
                                    skipFirstLine=True,
                                    split=",")
    test = csv_io_np.read_data("PreProcessData/test.csv",
                               skipFirstLine=True,
                               split=",")

    print "Data Read Complete"

    avg = 0
    NumFolds = 5

    predicted_list = []
    bootstrapLists = []

    # 100 produced 94%
    # 1000 did not finish after 5+ hours...
    # 300 about 5 hours, .9691 on first CF
    # learn_rate=0.01, n_estimators=300, subsample=1.0, min_samples_split=30, 0.9386
    #		GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=300, subsample=1.0, min_samples_split=30, min_samples_leaf=1, max_depth=5, init=None, random_state=None, max_features=None)
    clfs = [
        SVC(C=1000000,
            kernel='rbf',
            degree=3,
            gamma=0.0000001,
            coef0=0.0,
            shrinking=True,
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False)
    ]

    print "Data size: ", len(trainBase), len(test)
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]
    #print trainPre[0]
    #scaler = preprocessing.Scaler().fit(trainPre)
    #trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)
    trainScaled = trainPre
    testScaled = testPre

    #print scaler.mean_
    #print scaler.std_
    print "Begin Training"

    lenTrainBase = len(trainBase)
    trainBase = []

    lenTest = len(test)
    test = []

    trainPre = []
    testPre = []

    gc.collect()

    CC = [6]
    gg = [-6.36, -6.35, -6.34, -6.33, -6.32]
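    # hedged note: CC and gg hold base-10 exponents; the grid below builds each
    # SVC with C=10**C and gamma=10**g.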

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0
        avg = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for C in CC:
            for g in gg:

                for train_index, test_index in Folds:

                    print "g:", g, "C:", C

                    #trainBaseTemp = [trainBase[i] for i in train_index]
                    #target = [x[0] for x in trainBaseTemp]
                    #train = [x[1:] for x in trainBaseTemp]

                    #testBaseTemp = [trainBase[i] for i in test_index]
                    #targetTest = [x[0] for x in testBaseTemp]
                    #trainTest = [x[1:] for x in testBaseTemp]

                    #test = [x[0:] for x in test]

                    target = [targetPre[i] for i in train_index]
                    train = [trainScaled[i] for i in train_index]

                    targetTest = [targetPre[i] for i in test_index]
                    trainTest = [trainScaled[i] for i in test_index]

                    print
                    print "Iteration: ", foldCount
                    print "LEN: ", len(train), len(
                        train[0]), len(target), len(trainTest), len(
                            trainTest[0])

                    clf = SVC(C=10**C,
                              kernel='rbf',
                              degree=4,
                              gamma=10**g,
                              coef0=0.0,
                              shrinking=True,
                              probability=False,
                              tol=0.001,
                              cache_size=200,
                              class_weight=None,
                              verbose=False)
                    #clf.set_params(C=10**C, gamma=2**g)

                    print datetime.datetime.now()
                    clf.fit(train, target)
                    print datetime.datetime.now()

                    prob = clf.predict(trainTest)

                    dataset_blend_train[test_index, ExecutionIndex] = prob

                    probSum = 0.0
                    count = 0.0

                    for i in range(0, len(prob)):
                        probX = prob[i]  #[1]
                        #print probX, targetTest[i]
                        if (targetTest[i] == probX):
                            probSum += 1.0
                        count = count + 1.0

                    print "Sum: ", probSum, count
                    print "Score: ", probSum / count

                    avg += (probSum / count) / NumFolds

                    #predicted_probs = clf.predict(testScaled)
                    ######predicted_list.append([x[1] for x in predicted_probs])
                    #dataset_blend_test_set[:, foldCount] = predicted_probs #[0]

                    foldCount = foldCount + 1

                    break
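                    # hedged note: this break runs only the first fold for each
                    # (C, g) pair, which is why the log below records avg*NumFolds
                    # to recover the single-fold accuracy.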

                #dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)

                now = datetime.datetime.now()

                #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))

                #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )

                csv_io.write_delimited_file("../tune/TuneLog.csv", [
                    now.strftime("%Y %m %d %H %M %S"), "Score:",
                    str(avg * NumFolds),
                    str(clf), "Folds:",
                    str(NumFolds), "Model", model, "", ""
                ],
                                            filemode="a",
                                            delimiter=",")

                #print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
Example #55
0
def PreProcess5():

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", skipFirstLine = False, split = "\t")
	shutil.copy2("PreProcessData/DataClassList4.csv", "PreProcessData/DataClassList5.csv")	
	
	target = [x[0] for x in trainBase]
	train = [x[1:] for x in trainBase]
	
	DataClassList = csv_io.read_data("PreProcessData/DataClassList5.csv", False)
	
	print "Data len: ", len(train[0])
	print "DataClassList len: ", len(DataClassList)
	#return
	
	# this seems about optimal, but has not been tuned on latest improvements.
	NumFeatures = 40
	# NOTE going from 30 to 20 features on KNN5 set has almost no effect.  Down to 15 is significant loss.
	# for GBM at 6 and 400 30 is 3.01 and 30 3.05.
	
	print "Scaling"
	targetPre = [x[0] for x in trainBase]
	trainPre = [x[1:] for x in trainBase]
	testPre = [x[0:] for x in test]
	#print trainPre[0]
	scaler = preprocessing.Scaler().fit(trainPre)
	trainScaled = scaler.transform(trainPre)
	#testScaled = scaler.transform(testPre)	
	
	

	#clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True)
	clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True) 
	#clf = ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True,compute_importances=True)
	
	print "Training"
	# producing memory errors, probably too much data.
	# recommend to use linear lasso.
	#est = LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
	#selector = RFE(est, 20, step=10)
	#selector = selector.fit(trainScaled, target)
	#print selector.support_
	#print selector.ranking_
	#return
	
	#trainPost = selector.transform(trainPre)
	#testPost = selector.transform(testPre)
	
	clf.fit(trainScaled, target)
		
	trainNew = []
	testNew = []

		
	print "Computing Importances"
	importances = clf.feature_importances_

	
	

	DataClassListNew = []
	for DataIndex, DataClass in enumerate(DataClassList):
		print DataClass[0], importances[DataIndex]
		DataClassListNew.append([DataClass[0], importances[DataIndex]])
		
	csv_io.write_delimited_file("PreProcessData/DataClassList_Importances_"  + str(NumFeatures) + ".csv", DataClassListNew)
	
	DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1), reverse=True)  
	csv_io.write_delimited_file("PreProcessData/DataClassList_Importances_sorted_"  + str(NumFeatures) + ".csv", DataClassListNew_temp)

	
	
	importancesTemp = sorted(importances, reverse=True)
	print len(importancesTemp), "importances"
				
	if ( len(importancesTemp) > NumFeatures):
		threshold = importancesTemp[NumFeatures]

		print "Importance threshold: ", threshold

		rowIndex = 0
		for row in train:
			newRow = []
			for impIndex, importance in enumerate(importances):
				if ( impIndex == 0):
					newRow.append(target[rowIndex])
				if ( importance > threshold ):	
					newRow.append(row[impIndex])
			trainNew.append(newRow)	
			rowIndex += 1
			
			
		for row in test:
			newRow = []
			for impIndex, importance in enumerate(importances):
				if ( importance > threshold ) :
					newRow.append(row[impIndex])
			testNew.append(newRow)	
				
	csv_io.write_delimited_file("PreProcessData/training_PreProcess5_" + str(NumFeatures) + ".csv", trainNew, delimiter="\t")		
	csv_io.write_delimited_file("PreProcessData/test_PreProcess5_" + str(NumFeatures) + ".csv", testNew, delimiter="\t")
Example #56
0
            print(str(averageSet[idx][ExecutionIndex]))
            average += averageSet[idx][ExecutionIndex]

            submission1[col] = dataset_blend_testSet[idx][:, ExecutionIndex]

            submission2[col] = dataset_blend_trainSet[idx][:, ExecutionIndex]

        average = average / 5
        now = datetime.datetime.now()

        submission1.to_csv("../submission/Stack" + dset + "_" +
                           now.strftime("%Y%m%d%H%M%S") + "_" + str(average) +
                           "_" + str(clf)[:12] + ".csv",
                           index=False)

        submission2.to_csv("../submission/Target_Stack" + dset + "_" +
                           now.strftime("%Y%m%d%H%M%S") + "_" + str(average) +
                           "_" + str(clf)[:12] + ".csv",
                           index=False)

        csv_io.write_delimited_file("../log/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(average),
            str(clf), "Folds:",
            str(NumFolds), "Model", model, "dset", dset
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Final Average: " + str(average))