def Analyze1():
    """Collect the training rows whose stacked prediction misses by more than 4.0.

    Copies the PreProcess3 test file and class list to their PreProcess8
    names, prints the weighted mean absolute error of the stored predictions
    over all rows, prints how many rows exceeded the limit, and writes those
    rows to ``PreProcessData/training_PreProcess8.csv`` (tab separated).
    """
    error_limit = 4.0
    prediction_name = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

    base_rows = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False, split="\t")
    for source_path, copy_path in (
            ("PreProcessData/test_PreProcess3.csv", "PreProcessData/test_PreProcess8.csv"),
            ("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList8.csv")):
        shutil.copy2(source_path, copy_path)
    row_weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)
    actual = [record[0] for record in base_rows]

    print("Loading Data")
    kept_rows = []
    weighted_error = 0.0
    total_weight = 0
    predictions = csv_io.read_data("../predictions/" + prediction_name,
                                   split=",", skipFirstLine=False)
    for position, predicted in enumerate(predictions):
        guess = predicted[0]
        truth = actual[position]
        weight = row_weights[position][0]
        if abs(guess - truth) > error_limit:
            print("%s %s" % (guess, truth))
            kept_rows.append(base_rows[position])
        # The score is accumulated over every row, not only the kept ones.
        weighted_error += weight * math.fabs(truth - guess)
        total_weight += weight

    print("%s %s" % ("Train Score: ", weighted_error / total_weight))
    print(len(kept_rows))
    csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv",
                                kept_rows, delimiter="\t")
def main():
    """Fit a 150-tree random forest and write ``Id,Solution`` predictions.

    Features come from ``data/train.csv``, labels from
    ``data/trainLabels.csv`` (flattened with ``ravel``) and the rows to
    score from ``data/test.csv``.
    """
    features = csv_io.read_data("data/train.csv")
    labels = ravel(csv_io.read_data("data/trainLabels.csv"))
    unlabelled = csv_io.read_data("data/test.csv")
    print(len(unlabelled))

    forest = RandomForestClassifier(n_estimators=150, min_samples_split=2,
                                    n_jobs=-1, random_state=1, oob_score=True)
    print('fitting the model')
    forest.fit(features, labels)

    # Probabilities are computed and formatted but only the classes are written.
    class_probabilities = forest.predict_proba(unlabelled)
    class_labels = forest.predict(unlabelled)
    print(class_labels[1:10])
    print(len(class_labels))

    class_probabilities = ["%f" % pair[1] for pair in class_probabilities]
    submission = ["%d,%d" % (row_id, label)
                  for row_id, label in enumerate(class_labels, 1)]
    print(submission[0:9])
    print(len(submission))
    csv_io.write_delimited_file("results/random_forest_solution.csv",
                                submission, header=['Id', 'Solution'])
def main():
    """Cross-validate a random forest, then predict the test set.

    Column 0 of ``../Data/train.csv`` is the label and columns 1..279 are the
    features; the test file is sliced to its first 279 columns.  Each of the
    five folds is scored with ``logloss.llfun`` and the mean is printed.

    The classifier is refitted on the complete training set before the test
    predictions are made.  Previously the submission came from whatever model
    the last fold left behind, i.e. one trained on only four fifths of the data.
    """
    # read in data, parse into training and target sets
    rows = csv.read_data("../Data/train.csv")
    target = np.array([x[0] for x in rows])
    train = np.array([x[1:280] for x in rows])

    # In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=120, min_samples_split=2,
                                 n_jobs=-1, max_depth=None)  # .46
    # cfr = GradientBoostingClassifier(n_estimators=120, learn_rate=0.57, max_depth=1)  # .50
    # cfr = ExtraTreesClassifier(n_estimators=120, max_depth=None, min_samples_split=1)  # .489

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    # Run the classifier on every train/validation split and collect the scores.
    results = []
    for count, (traincv, testcv) in enumerate(cv, 1):
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        result = logloss.llfun(target[testcv], [x[1] for x in probas])
        print('fold: %d, result: %f' % (count, result))
        results.append(result)

    # print out the mean of the cross-validated results
    print("Results: " + str(np.array(results).mean()))

    # Use every labelled row for the model that produces the submission.
    cfr.fit(train, target)

    test = csv.read_data("../Data/test.csv")
    predicted_probs = cfr.predict_proba([x[0:279] for x in test])
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv.write_delimited_file("../Submissions/rf_cv.csv", predicted_probs)
def main():
    """Train a 150-tree random forest and write the second probability column.

    Column 0 of each row in ``../data/train.csv`` is the response and the
    rest are features; rows of ``../data/test.csv`` are scored unchanged.
    """
    labelled_rows = csv_io.read_data("../data/train.csv")
    responses = [record[0] for record in labelled_rows]
    features = [record[1:] for record in labelled_rows]
    unlabelled_rows = csv_io.read_data("../data/test.csv")

    forest = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1)
    print('fitting the model')
    forest.fit(features, responses)

    formatted = ["%f" % pair[1] for pair in forest.predict_proba(unlabelled_rows)]
    csv_io.write_delimited_file("random_forest_solution.csv", formatted)
    print('Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle')
def PreProcess3():
    """Print the result of chi-squared selection of the 200 best features.

    Column 0 of ``PreProcessData/training_PreProcess2_temp.csv`` is the target
    and the remaining columns are the candidate features.  The function prints
    the indices of the selected columns, then the reduced feature matrix next
    to the first original feature column.  Nothing is written to disk.

    Changes from the previous version:
      * the reduced matrix is computed from ``train``; the old code passed an
        undefined name ``X`` and raised ``NameError``;
      * the stand-alone ``chi2(train, target)`` call was dropped because its
        result was immediately overwritten by the fitted selector;
      * the statements after the bare ``return`` were removed.  They could
        never run and relied on ``clf``, whose definition is commented out.
        The test-file read existed only for that code and went with it.
    """
    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2_temp.csv", False)

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    NumFeatures = 200

    print("Training")
    selector = SelectKBest(chi2, k=NumFeatures).fit(train, target)

    print(selector.get_support(indices=True))
    print("%s %s" % (selector.transform(train), np.array(train)[:, [0]]))
def Blend():
    """Fit the blending model(s) on the stacked predictions and write submissions.

    For every model in the list (currently one ``LinearRegression``) the
    function writes a ``BlendSingle<timestamp>.csv`` file, prints the weighted
    mean absolute error on the blend training set, and stores the formatted
    test predictions in one column of a matrix.  The row means of that matrix
    are written to ``FinalBlend_<timestamp>.csv``.
    """
    base_rows = csv_io.read_data("PreProcessData/training_PreProcess4.csv",
                                 skipFirstLine=False, split="\t")
    row_weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    SEED = 448
    labels = [record[0] for record in base_rows]

    blend_train, blend_test = stack.run_stack(SEED)

    models = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]

    # The test file is only needed for its row count.
    test_rows = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    per_model_test = np.zeros((len(test_rows), len(models)))

    for column, model in enumerate(models):
        model.fit(blend_train, labels)
        formatted = ["%f" % value for value in model.predict(blend_test)]

        now = datetime.datetime.now()
        csv_io.write_delimited_file(
            "../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv",
            formatted)

        # Score the training set to estimate how the blend will perform.
        weighted_error = 0.0
        total_weight = 0
        for position, fitted in enumerate(model.predict(blend_train)):
            weight = row_weights[position][0]
            weighted_error += weight * math.fabs(labels[position] - fitted)
            total_weight += weight
        print("%s %s" % ("Train Score: ", weighted_error / total_weight))

        # The already-formatted strings are stored; numpy converts them to floats.
        per_model_test[:, column] = formatted

    csv_io.write_delimited_file_single(
        "../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        per_model_test.mean(1))
def Blend():
    """Blend the stacked model outputs and write per-model and final files.

    Each model in the list (a single ``LinearRegression`` at present) is
    fitted on the stacked training predictions.  Its test predictions go to a
    ``BlendSingle<timestamp>.csv`` file, its weighted mean absolute training
    error is printed, and the mean across models is written to
    ``FinalBlend_<timestamp>.csv``.
    """
    training_rows = csv_io.read_data("PreProcessData/training_PreProcess4.csv",
                                     skipFirstLine=False, split="\t")
    sample_weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    SEED = 448
    truth = [row[0] for row in training_rows]

    stacked_train, stacked_test = stack.run_stack(SEED)

    estimators = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)]

    # Only the number of test rows is used from this file.
    scoring_rows = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False)
    test_matrix = np.zeros((len(scoring_rows), len(estimators)))

    for slot, estimator in enumerate(estimators):
        estimator.fit(stacked_train, truth)
        as_text = ["%f" % number for number in estimator.predict(stacked_test)]

        now = datetime.datetime.now()
        single_path = "../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv"
        csv_io.write_delimited_file(single_path, as_text)

        # Weighted absolute error on the rows the blender was trained on.
        error_total = 0.0
        weight_total = 0
        for idx, estimate in enumerate(estimator.predict(stacked_train)):
            w = sample_weights[idx][0]
            error_total += w * math.fabs(truth[idx] - estimate)
            weight_total += w
        print("%s %s" % ("Train Score: ", error_total / weight_total))

        # Stored as the formatted strings; numpy casts them back to floats.
        test_matrix[:, slot] = as_text

    final_path = "../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv"
    csv_io.write_delimited_file_single(final_path, test_matrix.mean(1))
def PreProcess3():
    """Rescale one stored submission with ``SimpleScale`` and save the result.

    Reads ``../Submissions/stack201208301510.csv``, applies
    ``SimpleScale(floor=0.05, ceiling=0.90)`` and writes the output next to it
    with a ``_SimpleScale`` suffix.
    """
    filename = "stack201208301510"
    source_path = "../Submissions/" + filename + ".csv"
    scaled_path = "../Submissions/" + filename + "_SimpleScale.csv"

    # Author's note: this took a 0.389 score and lowered it to 0.40, not good...
    scaled = SimpleScale(csv_io.read_data(source_path, False),
                         floor=0.05, ceiling=0.90)
    csv_io.write_delimited_file(scaled_path, scaled)
def main():
    """Train a 100-tree random forest benchmark and write its probabilities.

    Column 0 of ``../Data/train.csv`` is the label, the remaining columns are
    features, and rows of ``../Data/test.csv`` are scored as-is.  The second
    probability column is written to ``../Submissions/rf_benchmark.csv``.

    The split-size keyword is spelled ``min_samples_split`` to agree with the
    other random-forest code in this file; this block previously used
    ``min_split``.
    """
    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = csv_io.read_data("../Data/test.csv")

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(train, target)

    predicted_probs = ["%f" % x[1] for x in rf.predict_proba(test)]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv", predicted_probs)
def main():
    """Train a probability-enabled SVC and write its second probability column.

    Column 0 of each training row is the label and the rest are features;
    test rows are passed to the model unchanged.
    """
    labelled = csv_io.read_data("../data/train.csv")
    labels = [record[0] for record in labelled]
    features = [record[1:] for record in labelled]
    unlabelled = csv_io.read_data("../data/test.csv")

    classifier = svm.SVC(probability=True)
    classifier.fit(features, labels)

    formatted = ["%f" % pair[1] for pair in classifier.predict_proba(unlabelled)]
    csv_io.write_delimited_file("../submissions/svm_benchmark.csv", formatted)
def main():
    """Write the training-set positive rate as the prediction for every test row.

    The rate is the sum of the integer labels in column 0 of
    ``../Data/train.csv`` divided by the number of training rows.
    """
    labelled = csv_io.read_data("../Data/train.csv")
    labels = [int(record[0]) for record in labelled]
    positive_rate = float(np.sum(labels)) / len(labels)

    unlabelled = csv_io.read_data("../Data/test.csv")
    formatted_rate = "%f" % positive_rate
    constant_column = [formatted_rate for _ in unlabelled]
    csv_io.write_delimited_file("../Submissions/optimized_value_benchmark.csv",
                                constant_column)
def main():
    """Fit an SVC with probability estimates and save the second probability column.

    The label is column 0 of ``../Data/train.csv``; every other column is a
    feature.  Rows of ``../Data/test.csv`` are scored without modification.
    """
    rows = csv_io.read_data("../Data/train.csv")
    labels = [row[0] for row in rows]
    features = [row[1:] for row in rows]
    to_score = csv_io.read_data("../Data/test.csv")

    model = svm.SVC(probability=True)
    model.fit(features, labels)

    output = ["%f" % probs[1] for probs in model.predict_proba(to_score)]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", output)
def main():
    """Write a constant 0.001 for every (test row, distinct target) pair.

    The number of columns equals the number of distinct values in the training
    target; the number of rows equals the number of test rows.
    """
    training, target = csv_io.read_data("../Data/train.csv")
    test, throwaway = csv_io.read_data("../Data/test.csv")

    column_count = len(set(target))
    row_count = len(test)

    cell = "%f" % 0.001
    table = [[cell] * column_count for _ in range(row_count)]
    csv_io.write_delimited_file("../Submissions/uniform_benchmark.csv", table)
def PreProcessRun(dataSet):
    """Append a ratio-of-two-columns feature to every row of a data set.

    Reads ``PreProcessData/<dataSet>_PreProcess.csv`` (tab separated), appends
    ``row[136] / row[139]`` to each row (indices shifted by -1 for the "test"
    set) and writes the rows to ``<dataSet>_PreProcess1.csv``.  For the
    "training" set the class list is first copied to ``DataClassList1.csv``
    and a ``<name>_DIV_<name>`` label for the new column is appended to it.
    """
    print("")
    print("%s %s" % ("DataSet: ", dataSet))
    print("Loading Data")
    data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess.csv",
                            split="\t", skipFirstLine=False)
    print("%s %s %s" % (dataSet, "Size: ", len(data[0])))

    is_training = dataSet == "training"
    if is_training:  # do only once.
        shutil.copy2("PreProcessData/DataClassList.csv",
                     "PreProcessData/DataClassList1.csv")
    DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv", False)

    # offset applies to the class-list lookup, offset2 to the row lookup.
    offset, offset2 = (1, -1) if dataSet == "test" else (0, 0)

    print(DataClassList)
    print("Appending New Data")

    top_index = 136 + offset2
    bottom_index = 139 + offset2
    for position, row in enumerate(data):
        ratio = row[top_index] / row[bottom_index]
        row.append(ratio)
        if position != 0:
            continue

        # Only the first row records the new column's label and echoes its values.
        label = ""
        if is_training:  # do only once.
            label = DataClassList[135 + offset][0] + "_DIV_" + DataClassList[139 + offset][0]
            csv_io.write_delimited_file("PreProcessData/DataClassList1.csv",
                                        [label], filemode="a")
        print("%s %s %s %s" % (row[top_index], row[bottom_index], ratio, label))

    csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess1.csv",
                                data, delimiter="\t")
    print("Done.")
def main():
    """Train an SVC and write every class probability, clamped to [0.001, 0.999].

    The first column of both the training and the test rows is dropped before
    fitting and scoring; targets are converted to floats.
    """
    training, target = csv_io.read_data("../Data/train.csv")
    features = [record[1:] for record in training]
    labels = [float(value) for value in target]

    test, throwaway = csv_io.read_data("../Data/test.csv")
    to_score = [record[1:] for record in test]

    classifier = svm.SVC(probability=True)
    classifier.fit(features, labels)

    formatted = [["%f" % min(max(p, 0.001), 0.999) for p in row]
                 for row in classifier.predict_proba(to_score)]
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", formatted)
def main():
    """Train a 100-tree random forest and write clamped class probabilities.

    The first column of the training and test rows is dropped, targets are
    converted to floats, and every predicted probability is clamped to
    [0.001, 0.999] before being written to ``../Submissions/rf_benchmark.csv``.

    The split-size keyword is spelled ``min_samples_split`` to agree with the
    other random-forest code in this file; this block previously used
    ``min_split``.
    """
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]

    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(training, target)

    predicted_probs = [["%f" % min(max(x, 0.001), 0.999) for x in y]
                       for y in rf.predict_proba(test)]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv", predicted_probs)
def main():
    """Fit a random forest benchmark and save its clamped class probabilities.

    Both data files are read as ``(rows, second_value)`` pairs.  The leading
    column of every row is discarded, targets become floats, and each
    probability is limited to the range [0.001, 0.999] on output.

    ``min_samples_split`` replaces the earlier ``min_split`` keyword so this
    block matches the spelling used by the other random-forest code in the file.
    """
    training, target = csv_io.read_data("../Data/train.csv")
    training = [x[1:] for x in training]
    target = [float(x) for x in target]

    test, throwaway = csv_io.read_data("../Data/test.csv")
    test = [x[1:] for x in test]

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(training, target)

    predicted_probs = [["%f" % min(max(x, 0.001), 0.999) for x in y]
                       for y in rf.predict_proba(test)]
    csv_io.write_delimited_file("../Submissions/rf_benchmark.csv", predicted_probs)
def main():
    """Combine three stored model outputs by majority vote.

    Column 1 of each result file is summed per row; a row is labelled 1 when
    the sum is at least 2 and 0 otherwise.  The ``Id,Solution`` lines are
    written to ``results/bagging_solution_7.csv``.
    """
    np.random.seed(42)

    result_files = ("results/gmm_pca12_6.csv",
                    "results/random_forest_solution-12pca-4.csv",
                    "results/svm_pca12_5.csv")
    vote_columns = [np.asarray(csv_io.read_data(path, header=True))[:, 1]
                    for path in result_files]

    vote_totals = np.column_stack(vote_columns).sum(axis=1)
    majority = np.where(vote_totals >= 2, 1.0, 0.0)

    submission = ["%d,%d" % (row_id, flag) for row_id, flag in enumerate(majority, 1)]
    print(submission[0:9])
    print(len(submission))
    csv_io.write_delimited_file("results/bagging_solution_7.csv", submission,
                                header=['Id', 'Solution'])
def main():
    """Fit an SVC benchmark and save its class probabilities, clamped on output.

    The leading column of every training and test row is discarded, targets
    become floats, and each probability is limited to [0.001, 0.999].
    """
    training, target = csv_io.read_data("../Data/train.csv")
    train_features = [row[1:] for row in training]
    train_labels = [float(item) for item in target]

    test, throwaway = csv_io.read_data("../Data/test.csv")
    test_features = [row[1:] for row in test]

    model = svm.SVC(probability=True)
    model.fit(train_features, train_labels)

    lower, upper = 0.001, 0.999
    table = []
    for probabilities in model.predict_proba(test_features):
        table.append(["%f" % min(max(value, lower), upper) for value in probabilities])
    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", table)
def main(strat = False, visualization = False):
    """Project the data with PCA, fit an SVC and write ``Id,Solution`` predictions.

    Args:
        strat: falsy to skip cross-validation; otherwise it is passed to
            ``range`` as the number of shuffled, stratified 10-fold runs.
            The [min, median, max] score of every run is printed.
        visualization: when true, a learning curve computed on an 80/20 split
            is saved to ``figures/learningCurve.pdf``.

    The number of PCA components is taken from ``num_pca``, which is not
    defined inside this function and must exist in the enclosing scope.

    Cross-validation scores are now computed once per shuffle.  The previous
    code called ``cross_val_score`` three times with identical arguments to
    obtain the minimum, median and maximum.
    """
    # read in the training file
    X = csv_io.read_data("data/train.csv")
    target = ravel(csv_io.read_data("data/trainLabels.csv"))
    realtest = csv_io.read_data("data/test.csv")
    print(len(realtest))

    # pca
    pca = PCA(n_components=num_pca)
    pca.fit(X)
    train = pca.transform(X)
    test_transformed = pca.transform(realtest)
    print('performed pca')

    # support vector classifier
    clf = svm.SVC()

    if strat:
        print("stratified cross-validation on shuffled data")
        # adapted from http://stackoverflow.com/a/8281241
        crossval = []
        for i in range(strat):
            X, y = shuffle(train, target, random_state=i)
            skf = StratifiedKFold(y, 10)
            scores = cross_val_score(clf, X, y, cv=skf)
            crossval.append([min(scores), np.median(scores), max(scores)])
        print(crossval)

    if visualization:
        print("preparing visualization")
        data_train, data_test, target_train, target_test = train_test_split(
            train, target, test_size=0.20, random_state=42)
        plot1 = drawLearningCurve(clf, data_train, target_train, data_test, target_test)
        pp = PdfPages('figures/learningCurve.pdf')
        pp.savefig(plot1)
        pp.close()

    print('fitting the model')
    clf.fit(train, target)

    # run model against test data
    predicted_class = clf.predict(test_transformed)
    print(predicted_class[0:9])
    print(len(predicted_class))

    print('Writing output')
    predicted_class = ["%d,%d" % (row_id, label)
                       for row_id, label in enumerate(predicted_class, 1)]
    print(predicted_class[0:9])
    print(len(predicted_class))
    csv_io.write_delimited_file("results/svm_pca12_5.csv", predicted_class,
                                header=['Id', 'Solution'])
    print('Finished. Exiting.')
def PreProcessRun(dataSet):
    """Drop a fixed set of columns from a data set and write it row by row.

    Reads ``PreProcessData/<dataSet>_PreProcess1.csv`` and appends every
    filtered row to ``<dataSet>_PreProcess2.csv``, removing any old copy
    first.  Columns 0, 2, 4 and 172 are skipped for "test"; for "training"
    the same list is matched against ``column - 1``.  For "test" the
    class-list entries of the kept columns are printed and written to
    ``DataClassList2.csv``.
    """
    print("")
    print("%s %s" % ("DataSet: ", dataSet))
    print("Loading Data")
    data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess1.csv",
                            split="\t", skipFirstLine=False)
    print("%s %s %s" % (dataSet, "Size: ", len(data[0])))

    output_path = "PreProcessData/" + dataSet + "_PreProcess2.csv"
    if os.path.exists(output_path):
        os.remove(output_path)

    SkipArr = [0, 2, 4, 172]
    is_training = dataSet == "training"
    is_test = dataSet == "test"

    def is_skipped(column):
        # Any other data set name keeps every column.
        if is_training:
            return (column - 1) in SkipArr
        if is_test:
            return column in SkipArr
        return False

    DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv", False)
    DataClassListNew = []

    for row_number, item in enumerate(data):
        kept = [(column, val) for column, val in enumerate(item)
                if not is_skipped(column)]
        rowNew = [val for _, val in kept]

        # The surviving class names are collected once, from the first test row.
        if is_test and row_number == 0:
            for column, _ in kept:
                print(DataClassList[column])
                DataClassListNew.append(DataClassList[column])

        csv_io.write_delimited_file(output_path, [copy.deepcopy(rowNew)],
                                    filemode="a", delimiter="\t")

    if is_test:
        csv_io.write_delimited_file("PreProcessData/DataClassList2.csv",
                                    DataClassListNew)
    print("Done.")
def main():
    """Average five stored submissions per row and write 0.2 or 0.80.

    For every row the first value of each of the five files is averaged.  An
    average below 0.5 yields 0.2, otherwise 0.80.  A diagnostic line is
    printed per row and the results go to ``../Submissions/stack.csv``.
    Waits for Enter before returning.
    """
    submission_files = (
        "../Submissions/et_stack_avg_benchmark.csv",
        "../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv",
        "../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv",
        "../Submissions/rf2_avg_benchmark.csv",
        "../Submissions/gb_avg_benchmark.csv",
    )
    stack = [csv_io.read_data(path, False) for path in submission_files]

    spanDistance = 3
    last_model = len(stack) - 1  # printed as the second diagnostic field

    finalList = []
    for p in range(len(stack[0])):
        temp_list = [model[p][0] for model in stack]
        avg = sum(temp_list) / float(len(stack))
        if avg < 0.5:
            chosen, extreme = 0.2, min(temp_list)
        else:
            chosen, extreme = 0.80, max(temp_list)
        finalList.append(chosen)
        print("%s %s %s %s %s" % (p, last_model, temp_list, avg, extreme))

    finalStack = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/stack.csv", finalStack)

    var = raw_input("Enter to terminate.")
def main():
    """Threshold the mean of five stored submissions into 0.2 / 0.80 outputs.

    Row by row, the first value from each file is collected and averaged.
    Rows averaging below 0.5 are written as 0.2 and all others as 0.80; one
    diagnostic line per row is printed.  Output goes to
    ``../Submissions/stack.csv`` and the function then waits for Enter.
    """
    et = csv_io.read_data("../Submissions/et_stack_avg_benchmark.csv", False)
    rbf = csv_io.read_data(
        "../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv", False)
    poly = csv_io.read_data(
        "../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv", False)
    rf = csv_io.read_data("../Submissions/rf2_avg_benchmark.csv", False)
    gb = csv_io.read_data("../Submissions/gb_avg_benchmark.csv", False)

    sources = [et, rbf, poly, rf, gb]
    source_count = len(sources)
    spanDistance = 3

    decisions = []
    for row_index in range(0, len(sources[0])):
        values = []
        for source in sources:
            values.append(source[row_index][0])
        mean_value = sum(values) / float(source_count)

        below_half = mean_value < 0.5
        decisions.append(0.2 if below_half else 0.80)
        reference = min(values) if below_half else max(values)
        # The second field is the index of the last source, as before.
        print("%s %s %s %s %s" % (row_index, source_count - 1, values,
                                  mean_value, reference))

    csv_io.write_delimited_file("../Submissions/stack.csv",
                                ["%f" % item for item in decisions])

    var = raw_input("Enter to terminate.")
def main():
    """Fit a logistic regression and write its second probability column.

    Column 0 of ``train.csv`` is the response and the rest are features; rows
    of ``test.csv`` are scored unchanged and the result is saved to
    ``log_solution.csv``.
    """
    labelled_rows = csv_io.read_data("train.csv")
    responses = [record[0] for record in labelled_rows]
    features = [record[1:] for record in labelled_rows]
    unlabelled_rows = csv_io.read_data("test.csv")

    model = LogisticRegression()
    model.fit(features, responses)

    formatted = ["%f" % pair[1] for pair in model.predict_proba(unlabelled_rows)]
    csv_io.write_delimited_file("log_solution.csv", formatted)

    print('Logistic Regression Complete! Submit log_solution.csv to Kaggle')
def Analyze1():
    """Extract the training rows that a stored stack prediction misses by > 4.0.

    Side effects: copies the PreProcess3 test file and class list to their
    PreProcess8 names, prints each offending (prediction, target) pair, the
    weighted mean absolute error across all rows and the number of rows kept,
    then writes the kept rows to ``PreProcessData/training_PreProcess8.csv``.
    """
    Threshold = 4.0
    targetFile = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

    training_rows = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                     skipFirstLine=False, split="\t")
    shutil.copy2("PreProcessData/test_PreProcess3.csv",
                 "PreProcessData/test_PreProcess8.csv")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList8.csv")
    sample_weights = csv_io.read_data("PreProcessData/Weights.csv",
                                      skipFirstLine=False)
    truth = [row[0] for row in training_rows]

    print("Loading Data")
    outliers = []
    error_total = 0.0
    weight_total = 0
    stored = csv_io.read_data("../predictions/" + targetFile, split=",",
                              skipFirstLine=False)

    for idx, entry in enumerate(stored):
        estimate = entry[0]
        expected = truth[idx]
        if abs(estimate - expected) > Threshold:
            print("%s %s" % (estimate, expected))
            outliers.append(training_rows[idx])
        # Every row contributes to the score, whether or not it was kept.
        w = sample_weights[idx][0]
        error_total += w * math.fabs(expected - estimate)
        weight_total += w

    print("%s %s" % ("Train Score: ", error_total / weight_total))
    print(len(outliers))
    csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv",
                                outliers, delimiter="\t")
def main():
    """Fit a 150-tree random forest on ``train.csv`` and score ``test.csv``.

    The first column of each training row is the response; the second
    probability column for every test row is written to
    ``random_forest_solution.csv``.
    """
    rows = csv_io.read_data("train.csv")
    responses = [row[0] for row in rows]
    predictors = [row[1:] for row in rows]
    to_score = csv_io.read_data("test.csv")

    model = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1)
    print("fitting the model")
    model.fit(predictors, responses)

    lines = ["%f" % probs[1] for probs in model.predict_proba(to_score)]
    csv_io.write_delimited_file("random_forest_solution.csv", lines)
    print("Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle")
def main():
    """Train a 10-tree random forest on a subset of columns and score the test set.

    Column 0 of ``train.csv`` is the response; columns 1, 3, 4, 5 and 6 are the
    features.  The second probability column for each test row is written to
    ``random_forest_solution.csv``.

    Fixes:
      * ``x[1,3,4,5,6]`` indexed a row with a tuple, which raises ``TypeError``
        for list rows; the columns are now picked one by one.
      * The test rows are reduced to the matching columns so the model is
        scored on the same features it was fitted on.  This assumes the test
        file has no response column, so each training column ``i`` is test
        column ``i - 1``, the layout the other train/test readers in this file
        rely on.
    """
    feature_columns = (1, 3, 4, 5, 6)

    # read in the training file
    train = csv_io.read_data("train.csv")
    # set the training responses
    target = [x[0] for x in train]
    # set the training features
    train = [[x[i] for i in feature_columns] for x in train]

    # read in the test file and keep the same features (shifted by one)
    realtest = csv_io.read_data("test.csv")
    realtest = [[x[i - 1] for i in feature_columns] for x in realtest]

    # random forest code
    rf = RandomForestClassifier(n_estimators=10)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)

    # run model against test data
    predicted_probs = ["%f" % x[1] for x in rf.predict_proba(realtest)]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)
def main():
    """Fit a small random forest on selected columns and write test probabilities.

    The response is column 0 of ``train.csv`` and the features are columns
    1, 3, 4, 5 and 6.  Output is the second probability column per test row,
    saved to ``random_forest_solution.csv``.

    Fixes:
      * the feature selection used ``x[1, 3, 4, 5, 6]``, a tuple index that
        raises ``TypeError`` on list rows; columns are now selected explicitly.
      * the test rows are filtered to the same features.  The test file is
        assumed to lack the response column (as in the other train/test
        readers in this file), so training column ``i`` maps to test column
        ``i - 1``.
    """
    feature_columns = (1, 3, 4, 5, 6)

    # read in the training file
    train = csv_io.read_data("train.csv")
    # set the training responses
    target = [x[0] for x in train]
    # set the training features
    train = [[x[i] for i in feature_columns] for x in train]

    # read in the test file and keep the same features (shifted by one)
    realtest = csv_io.read_data("test.csv")
    realtest = [[x[i - 1] for i in feature_columns] for x in realtest]

    # random forest code
    rf = RandomForestClassifier(n_estimators=10)
    # fit the training data
    print('fitting the model')
    rf.fit(train, target)

    # run model against test data
    predicted_probs = ["%f" % x[1] for x in rf.predict_proba(realtest)]
    csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)
def main():
    """Fit a 3-nearest-neighbour classifier, score a hold-out slice and predict.

    Training uses rows 1..2999 of ``../Data/train.csv`` and the hold-out uses
    rows 3001 onward.  Hold-out predictions are clamped to [0.1, 0.9] and
    scored with the mean negative log-likelihood, which is printed.  When that
    score beats the initial bound of 100, test predictions are written to
    ``../Submissions/knn.csv``.  Waits for Enter before returning.

    Fixes:
      * the per-row debug output indexed the training labels (``target[i]``)
        while looping over hold-out predictions, so it printed unrelated
        labels and could raise ``IndexError``; it now uses ``targetTest[i]``
        like the score itself.
      * ``knn_stats.txt`` is closed even when fitting or scoring fails.

    NOTE(review): the slices ``[1:3000]`` and ``[3001:]`` leave out rows 0 and
    3000.  That is kept as found; confirm whether it is intentional.
    NOTE(review): statement nesting was reconstructed from a flattened source;
    the submission is assumed to be written only when the score improves.
    """
    startCol = 0
    endCol = 1775  # max = 1775

    train = csv_io.read_data("../Data/train.csv")
    target = [x[0] for x in train][1:3000]
    targetTest = [x[0] for x in train][3001:]
    trainTest = [x[startCol + 1:endCol + 1] for x in train][3001:]
    test = csv_io.read_data("../Data/test.csv")
    test = [x[startCol:endCol] for x in test]

    train = [x[startCol + 1:endCol + 1] for x in train][1:3000]

    # Opened in append mode but never written to; kept so the file is created.
    fo = open("knn_stats.txt", "a+")
    try:
        # n_neighbors=15, weights='distance' return 0.65
        # n_neighbors=3, weights='distance' 0.60
        rf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance',
                                            algorithm='brute', leaf_size=100,
                                            warn_on_equidistant=True, p=2)
        rf.fit(train, target)
        prob = rf.predict(trainTest)  # changed from test

        result = 100
        probSum = 0
        for i, probX in enumerate(prob):
            # Clamp so that log() below never sees 0 or 1.
            if probX > 0.9:
                probX = 0.9
            if probX < 0.1:
                probX = 0.1

            print("%s %s %s %s" % (i, probSum, probX, targetTest[i]))
            print("%s %s" % (targetTest[i] * log(probX),
                             (1 - targetTest[i]) * log(1 - probX)))

            probSum += targetTest[i] * log(probX) + (1 - targetTest[i]) * log(1 - probX)

        score = -probSum / len(prob)
        print(score)

        if score < result:
            result = score
            predicted_probs = rf.predict(test)  # was test
            predicted_probs = ["%f" % x for x in predicted_probs]
            csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs)
            print("Generated Data!!")
    finally:
        fo.close()

    var = raw_input("Enter to terminate.")
def PreProcess4(N_Features):
    """Keep the most important features according to a gradient-boosting model.

    Args:
        N_Features: number of features to aim for.  The importance ranked at
            position ``N_Features`` (0-based, descending) becomes the cut-off
            and only features strictly above it are kept.

    Reads the PreProcess3 training and test files, copies the class list to
    ``DataClassList4.csv``, fits a ``GradientBoostingRegressor`` on the first
    5000 scaled training rows, writes the per-feature importances (unsorted
    and sorted) and finally writes the reduced training file (target first)
    and test file with an ``_<N_Features>`` suffix.

    Fix: when there are not more than ``N_Features`` importances the cut-off
    was never assigned and the function raised ``NameError``.  It now falls
    back to negative infinity, so every feature is kept in that case.
    """
    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False, split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False, split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv", False)

    print("%s %s" % ("Data len: ", len(train[0])))
    print("%s %s" % ("DataClassList len: ", len(DataClassList)))

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = N_Features
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect.
    # Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    print("Scaling")
    term = 5000  # scaler has memory errors between 5000 and 10000
    targetPre = target[0:term]
    trainPre = train[0:term]

    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)

    clf = GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5,
                                    max_depth=6, n_estimators=400,
                                    random_state=166, min_samples_leaf=30)

    print("Training")
    clf.fit(trainScaled, targetPre)

    print("Computing Importances")
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print("%s %s" % (DataClass[0], importances[DataIndex]))
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_" + str(NumFeatures) + ".csv",
        DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_" + str(NumFeatures) + ".csv",
        DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print("%s %s" % (len(importancesTemp), "importances"))

    if len(importancesTemp) > NumFeatures:
        threshold = importancesTemp[NumFeatures]
    else:
        # Fewer candidates than requested: keep them all.
        threshold = float("-inf")

    print("%s %s" % ("Importance threshold: ", threshold))

    selected = [impIndex for impIndex, importance in enumerate(importances)
                if importance > threshold]

    # Training rows keep their target in the first column.
    trainNew = [[label] + [row[impIndex] for impIndex in selected]
                for label, row in zip(target, train)]
    testNew = [[row[impIndex] for impIndex in selected] for row in test]

    csv_io.write_delimited_file(
        "PreProcessData/training_PreProcess4_" + str(NumFeatures) + ".csv",
        trainNew, delimiter="\t")
    csv_io.write_delimited_file(
        "PreProcessData/test_PreProcess4_" + str(NumFeatures) + ".csv",
        testNew, delimiter="\t")
def run_stack(SEED, col, alpha):
    """Blend the saved "Stack ... Lasso" predictions for one target column.

    Every file in ../predictions whose name starts with "Stack", contains
    "Lasso" and whose third "_"-separated part is a loss below 0.46 becomes
    one input feature.  The features are standardised (scaler fitted on
    train and test together) and a Lasso(alpha) is evaluated with 5-fold
    cross validation.  Out-of-fold train predictions and fold-averaged test
    predictions are written to ../submission/temp and a line is appended to
    ../log/partial/RunLogBlend.csv.

    SEED  -- accepted but not used by this function.
    col   -- target column, one of 'Ca', 'P', 'pH', 'SOC', 'Sand'.
    alpha -- regularisation strength handed to Lasso.

    Returns (dataset_blend_train, dataset_blend_test, averageSet, clfs,
    NumFolds, model).

    Raises ValueError for an unknown ``col`` (previously this surfaced later
    as a NameError on ``targetCol``).
    """
    # first col is PIDN, after that we have 'Ca','P','pH','SOC','Sand',
    # so the target's position is its index in that list plus 1.
    targetColumns = {'Ca': 1, 'P': 2, 'pH': 3, 'SOC': 4, 'Sand': 5}
    if col not in targetColumns:
        raise ValueError("unknown target column: %r" % (col,))
    targetCol = targetColumns[col]

    model = "Lasso"
    lossThreshold = 0.46

    # NOTE: dset is a module-level name defined outside this function.
    trainBaseTarget = pd.read_csv('../preprocess/pre_shuffled_target_' + col + '.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + dset + '_train_' + col + '.csv')
    testOrig = pd.read_csv('../models/' + model + dset + '_test_' + col + '.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    trainBaseID = trainBaseOrig['PIDN']
    testID = testOrig['PIDN']

    avg = 0
    NumFolds = 5

    # ----------------------
    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        # float(parts[2]) is only reached for names passing the first two tests.
        if (filename[0:5] == "Stack" and "Lasso" in filename
                and float(parts[2]) < lossThreshold):
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, stackFile in enumerate(stackFiles):
        print(stackFile)
        # skip first because of header.
        trn = csv_io.read_data("../predictions/Target_" + stackFile,
                               split=",", skipFirstLine=True)
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[targetCol]

        # skip first because of header.
        tst = csv_io.read_data("../predictions/" + stackFile,
                               split=",", skipFirstLine=True)
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[targetCol]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)

    print("Num file processed: " + " " + str(len(stackFiles)) + " " +
          "Threshold: " + str(lossThreshold))

    print("Starting Scale")
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # should fit on the combined sets.
    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")
    # --------------------------------
    clfs = [
        Lasso(alpha=alpha),
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))
    averageSet = []

    print("Begin Training")
    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0
        # One column of test predictions per fold; averaged after the loop.
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)

        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]

            target = np.array(np.reshape(target, (-1, 1)))
            targetTest = np.array(np.reshape(targetTest, (-1, 1)))

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            # Out-of-fold predictions (Ridge would need predicted[:, 0] here).
            dataset_blend_train[test_index, ExecutionIndex] = predicted

            foldScore = math.sqrt(mean_squared_error(targetTest, predicted))
            print(str(foldScore))
            avg += foldScore / NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted

            foldCount = foldCount + 1

        averageSet.extend([avg])

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        fileTag = (dset + "_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) +
                   "_" + str(clf)[:12] + "_" + col + ".csv")

        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['PIDN', col])
        submission[col] = dataset_blend_test[:, ExecutionIndex]
        submission['PIDN'] = testID
        submission.to_csv("../submission/temp/Blend_" + fileTag, index=False)

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['PIDN', col])
        submission[col] = dataset_blend_train[:, ExecutionIndex]
        submission['PIDN'] = trainBaseID
        submission.to_csv("../submission/temp/Target_Blend_" + fileTag, index=False)

        csv_io.write_delimited_file("../log/partial/RunLogBlend.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "Blend", "Stacks", stackFiles
        ], filemode="a", delimiter=",")

        print("------------------------Average: " + str(avg))

    return dataset_blend_train, dataset_blend_test, averageSet, clfs, NumFolds, model
def run_stack(SEED): model = "" print "Running Stack." avg = 0 NumFolds = 5 # 5 is good, but 10 yeilds a better mean since outliers are less significant. targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",") trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",") #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",") trainBase = trainBase[0:5000] targetX = targetX[0:5000] train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",") test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",") predicted_list = [] bootstrapLists = [] clfs = [ GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1) ] #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1) #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1), #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1), print "Data size: ", len(trainBase) , 11573 # len(test) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) #dataset_blend_test = np.zeros((len(test), len(clfs))) dataset_blend_test = np.zeros(11573, len(clfs)) #targetPre = target #[0:5000] #testScaled = test #trainScaled = trainBase #[0:5000] #targetPre = target #[0:5000] #testScaled = test #trainScaled = trainBase #[0:5000] print "Begin Training" lenTrainBase = len(trainBase) #lenTrainBase = len(trainBase[0:5000]) lenTest = 11573 #lenTest = len(test) gc.collect() for ExecutionIndex, clf in enumerate(clfs): print clf avg = 0 predicted_list = [] dataset_blend_test_set = np.zeros((lenTest, NumFolds)) foldCount = 0 #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))] 
Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True) for train_index, test_index in Folds: target = [targetX[i] for i in train_index] train = [trainBase[i] for i in train_index] targetTest = [targetX[i] for i in test_index] trainTest = [trainBase[i] for i in test_index] #target = [targetPre[i] for i in train_index] #train = [trainScaled[i] for i in train_index] #targetTest = [targetPre[i] for i in test_index] #trainTest = [trainScaled[i] for i in test_index] gc.collect() print print "Iteration: ", foldCount print "LEN: ", len(train), len(target) #print train[0] #print target[0] #return print "Start", datetime.datetime.now() clf.fit(train, target) prob = clf.predict(trainTest) print "End ", datetime.datetime.now() dataset_blend_train[test_index, ExecutionIndex] = prob gc.collect() probSum = 0 weightSum = 0 # totalOffByHalf = 0 # totalPositive = 0 # totalPositiveOffByHalf = 0 # totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i] probX = 31100.0 print targetTest[i][0], probX probSum += math.pow(math.log10(targetTest[i][0]) - math.log10(probX), 2) #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX) #weightSum += weights[test_index[i]][0] #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX # log loss cal #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX) # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ): # totalOffByHalf = totalOffByHalf + 1 # if ( int(targetTest[i]) == 1 ): # totalPositive = totalPositive + 1 # if ( int(targetTest[i]) == 1 and probX < 0.5): # totalPositiveOffByHalf = totalPositiveOffByHalf + 1 # if (probX > 0.5): # totalPositivePredictions = totalPositivePredictions + 1 # print # print "Stats:" # print "Total Off By > 0.5 ", totalOffByHalf # print "Total Positive ", totalPositive # print "Total Positive Off By Half ", totalPositiveOffByHalf # print "Total Positive Predictions ", 
totalPositivePredictions #print -probSum/len(prob) print "Score: ", math.sqrt(probSum/len(prob)) avg += math.sqrt(probSum/len(prob))/NumFolds gc.collect() fo = open("test1.csv", "r") predicted_probs = [] for line in fo: line = line.strip().split(",") newRow = [] for item in line: newRow.append(float(item)) predicted_probs.append(clf.predict(newRow)) fo.close() #predicted_probs = clf.predict(testScaled) #predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_set[:, foldCount] = predicted_probs #[0] gc.collect() foldCount = foldCount + 1 dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1) #print "Saving NP" #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set) #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) ) #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test) #print "Done Saving NP" now = datetime.datetime.now() #print dataset_blend_test_set.mean(1) csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a",delimiter=",") print "------------------------Average: ", avg #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train) return dataset_blend_train, dataset_blend_test
def PreProcess3(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", split="\t", skipFirstLine=False) test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", split="\t", skipFirstLine=False) weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False) print "Train Size: ", len(trainBase[0]), "Test Size: ", len(test[0]) shutil.copy2("PreProcessData/DataClassList2.csv", "PreProcessData/DataClassList3.csv") lat = len(trainBase[0]) - 2 long = len(trainBase[0]) - 1 target = [x[0] for x in trainBase] train = [x[lat:long + 1] for x in trainBase] n_neighborsArr = [5] leaf_sizeArr = [30] for n_neighbor in n_neighborsArr: for leaf_s in leaf_sizeArr: print "Training neighbors: ", n_neighbor, "leaf_size: ", leaf_s neigh = KNeighborsRegressor(n_neighbors=n_neighbor, warn_on_equidistant=False, leaf_size=leaf_s, algorithm="ball_tree", weights=myFunc) neigh.fit(train, target) probSum = 0 weightSum = 0 for index, data in enumerate(trainBase): pred = neigh.predict([data[lat], data[long]]) #print data[lat], data[long], "Prediction: ", pred[0], "Target: ", target[index] if (len(n_neighborsArr) == 1): trainBase[index].append(pred[0]) probSum += weights[index][0] * math.fabs(target[index] - pred[0]) weightSum += weights[index][0] print "Score: ", probSum / weightSum if (len(n_neighborsArr) > 1): continue for index, data in enumerate(test): pred = neigh.predict([data[lat - 1], data[long - 1]]) #print data[lat - 1], data[long - 1], "Prediction: ", pred[0] if (len(n_neighborsArr) == 1): test[index].append(pred[0]) if (len(n_neighborsArr) > 1): return with open("PreProcessData/DataClassList3.csv", "a") as myfile: myfile.write("Lat-Long-Predictor\n") print "Writing Data" csv_io.write_delimited_file("PreProcessData/training_PreProcess3.csv", trainBase, delimiter="\t") csv_io.write_delimited_file("PreProcessData/test_PreProcess3.csv", test, delimiter="\t") print "Done."
def main(): trainBase = csv_io.read_data("PreProcessData/PreProcess2.csv", False) avg = 0 NumFolds = 5 # should be odd for median predicted_list = [] spanDistance = 12 bootstrapLists = [] NeighborsArray = [10] for Neighbors in NeighborsArray: predicted_list = [] Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None) for train_index, test_index in Folds: trainBaseTemp = [trainBase[i + 1] for i in train_index] #trainBaseTemp = trainBase target = [x[0] for x in trainBaseTemp] train = [x[1:] for x in trainBaseTemp] testBaseTemp = [trainBase[i + 1] for i in test_index] #testBaseTemp = trainBase targetTest = [x[0] for x in testBaseTemp] trainTest = [x[1:] for x in testBaseTemp] test = csv_io.read_data("PreProcessData/PreTestData2.csv", False) test = [x[0:] for x in test] kn = neighbors.KNeighborsClassifier(n_neighbors=Neighbors, weights='distance', algorithm='brute', leaf_size=100, warn_on_equidistant=True, p=2) kn.fit(train, target) prob = kn.predict(trainTest) prob = SimpleScale(prob) # scale output probababilities probSum = 0 totalOffByHalf = 0 totalPositive = 0 totalPositiveOffByHalf = 0 totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i][1] # [1] if (probX > 0.999): probX = 0.999 if (probX < 0.001): probX = 0.001 #print i, probSum, probX, targetTest[i] #print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += int(targetTest[i]) * log(probX) + ( 1 - int(targetTest[i])) * log(1 - probX) if (math.fabs(probX - int(targetTest[i])) > 0.5): totalOffByHalf = totalOffByHalf + 1 if (int(targetTest[i]) == 1): totalPositive = totalPositive + 1 if (int(targetTest[i]) == 1 and probX < 0.5): totalPositiveOffByHalf = totalPositiveOffByHalf + 1 if (probX > 0.5): totalPositivePredictions = totalPositivePredictions + 1 print "Total Off By > 0.5 ", totalOffByHalf print "Total Positive ", totalPositive print "Total Positive Off By Half ", totalPositiveOffByHalf print "Total Positive Predictions ", 
totalPositivePredictions print "Neighbors: ", Neighbors print -probSum / len(prob) avg += (-probSum / len(prob)) / NumFolds predicted_probs = kn.predict(test) # was test prob = SimpleScale(prob) # scale output probababilities predicted_list.append([x[1] for x in predicted_probs]) avg_list = [] med_list = [] # For N folds, get the average/median for each prediction item in test set. for p in range(0, len(test)): temp_list = [] for q in range(0, len(predicted_list)): temp_list.append(predicted_list[q][p]) avg_list.append(mean(temp_list)) med_list.append(getMedian(temp_list)) #print p, q, temp_list, mean(temp_list), getMedian(temp_list) bootstrapLists.append(avg_list) # This would be used if we ran multiple runs with different training values. # Primitive stacking, should rather save data, and do formal stacking. if (len(bootstrapLists) > 1): finalList = [] for p in range(0, len(test)): temp_list = [] for q in range(0, len(bootstrapLists)): temp_list.append(bootstrapLists[q][p]) finalList.append(meanSpan(temp_list, spanDistance)) print p, q, temp_list, meanSpan(temp_list, spanDistance) else: finalList = bootstrapLists[0] avg_values = ["%f" % x for x in finalList] csv_io.write_delimited_file("../Submissions/rf2_stack_avg.csv", avg_values) print "Average: ", avg var = raw_input("Enter to terminate.")
def PreProcessRun(dataSet): print print "DataSet: ", dataSet if ( os.path.exists("PreProcessData/" + dataSet + "_PreProcess.csv") ): os.remove("PreProcessData/" + dataSet + "_PreProcess.csv") DataClassList = [] f1 = open("../" + dataSet + "/" + dataSet + "_filev1.csv", 'r') f2 = open("PreProcessData/" + dataSet + "_PreProcess_temp.csv", 'w') for line in f1: newLine = "" gotQuote = False for c in line: if ( c == "\"" and gotQuote == False ): gotQuote = True elif ( c == "\"" and gotQuote == True ): gotQuote = False if ( gotQuote == True and c == ","): continue elif(gotQuote == True): newLine += c else: if ( c == ","): newLine += "\t" else: newLine += c f2.write(newLine) f1.close() f2.close() data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess_temp.csv", split="\t" ,skipFirstLine = False) weights = [] first = True if ( dataSet == "training"): for row in data: if ( first == True ) : first = False continue weights.append([row[13]]) #print row[13] csv_io.write_delimited_file("PreProcessData/Weights.csv", weights) data = csv_io.read_data("PreProcessData/training_PreProcess_temp.csv", split="\t" ,skipFirstLine = True) meanSum = [0.0] * 200 meanCount = [0] * 200 for index, val in enumerate(meanSum): meanCount[index] = 0 meanSum[index] = 0.0 for row in data: for index, val in enumerate(row): if ( isinstance(val, float) and val != 0.0): meanCount[index] += 1 meanSum[index] += val #else: #print "skip: ", val for index, val in enumerate(meanSum): if meanCount[index] > 0: meanSum[index] = meanSum[index]/float(meanCount[index]) data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess_temp.csv", split="\t" ,skipFirstLine = False) SkipArr = [0,2,4,171] for index, item in enumerate(data[0]): #print item if index in SkipArr: continue if "MOE_" in item: print "MOE_", item SkipArr.append(index) continue if ( index == 170 ): #DataClassList.insert(0, item) continue else: DataClassList.append(item) continue print "Len: ", len(data[0]) first = True for item in 
data: #print item if ( first == True ): first = False continue rowNew = [] for index, val in enumerate(item): if index in SkipArr: continue # in training this is the target value(append to beginning ), and in test this is the weight (just skip it) if ( index == 170): #print "prepend", val if dataSet == "training": rowNew.insert(0, val) continue if ( val == "" or val == "NA" or val == "0" or val == "0.0" or val == 0 or val == 0.0): rowNew.append(meanSum[index]) elif isinstance(val, str): rowNew.append(toFloat(val.replace("$", ""))) else: rowNew.append(val) csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess.csv", [copy.deepcopy(rowNew)], filemode="a", delimiter="\t") csv_io.write_delimited_file("PreProcessData/DataClassList.csv", DataClassList) print "Done."
def main(): current = strftime("%Y%m%d", gmtime()) trainfilename = os.path.join(os.path.dirname(__file__), 'data', 'train.csv') testfilename = os.path.join(os.path.dirname(__file__), 'data', 'test.csv') train_X = pd.read_csv(trainfilename) print "Basic info on training data" print len(train_X) print len(train_X.columns) print train_X.columns train_Y = train_X.take([1], axis=1) # print train_X.columns orig_test_X = pd.read_csv(testfilename) test_X = orig_test_X #Binaries train_X["Sex"] = train_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1) test_X["Sex"] = test_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1) #Remove train_X = train_X.drop('Embarked',1) test_X = test_X.drop('Embarked',1) train_X = train_X.drop('Ticket',1) test_X = test_X.drop('Ticket',1) train_X["Name"] = train_X["Name"].apply(lambda name: nameparser(name)) test_X["Name"] = test_X["Name"].apply(lambda name: nameparser(name)) train_X = train_X.drop('Cabin',1) test_X = test_X.drop('Cabin',1) train_X["alone"] = train_X.apply(alone, axis=1) test_X["alone"] = test_X.apply(alone, axis=1) train_X = train_X.drop('SibSp',1) test_X = test_X.drop('SibSp',1) train_X = train_X.drop('Parch',1) test_X = test_X.drop('Parch',1) for name in train_X['Name'].unique(): print "For name " + str(name) imp = Imputer(missing_values='NaN', strategy='mean', axis=1) features = pd.concat([train_X[train_X['Name'] == name]['Age'],test_X[test_X['Name'] == name]['Age']]).values imp.fit(features) features = train_X[train_X['Name'] == name]['Age'].values train_X.loc[train_X.Name == name,'Age'] = list(imp.transform(features)[0]) print np.std(train_X[train_X['Name'] == name]['Age']) print np.mean(train_X[train_X['Name'] == name]['Age']) features = test_X[test_X['Name'] == name]['Age'].values test_X.loc[test_X.Name == name,'Age'] = list(imp.transform(features)[0]) print np.std(test_X[test_X['Name'] == name]['Age']) print np.mean(test_X[test_X['Name'] == name]['Age']) train_X["woman_child_man"] = train_X.apply(lambda row: 
woman_child_or_man(row), axis=1) test_X["woman_child_man"] = test_X.apply(lambda row: woman_child_or_man(row), axis=1) train_X = train_X.drop('Name',1) test_X = test_X.drop('Name',1) newcolumns= ["woman_child_man"] train_one_hot_X, test_one_hot_X = one_hot_dataframe(train_X, test_X, newcolumns, replace=True) train_X = train_one_hot_X test_X = test_one_hot_X print train_X.columns print test_X.columns train_X = train_X.drop('PassengerId',1) test_X = test_X.drop('PassengerId',1) # http://triangleinequality.wordpress.com/2013/05/19/machine-learning-with-python-first-steps-munging/ #Age through Imputation performed already# # Fare # #Fare imputation may not help: see http://nbviewer.ipython.org/gist/mwaskom/8224591 train_X.Fare = train_X.Fare.map(lambda x: np.nan if x==0 else x) test_X.Fare = test_X.Fare.map(lambda x: np.nan if x==0 else x) classmeans = pd.concat([train_X, test_X]).pivot_table('Fare', rows='Pclass', aggfunc='median') # classmeans = test_X.pivot_table('Fare', rows='Pclass', aggfunc='mean') train_X.Fare = train_X[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 ) test_X.Fare = test_X[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 ) train_X.to_csv(os.path.join(os.path.dirname(__file__), 'data', "simple_processed_train_data_est_{0}.csv".format(current))) test_X.to_csv(os.path.join(os.path.dirname(__file__), 'data', "simple_processed_test_data_est_{0}.csv".format(current))) train_X = train_X.drop('Survived',1) print "Finished reading data" train = train_X.values.astype(np.float) target = np.ravel(train_Y.values.astype(np.float)) # random forest code forest = RandomForestClassifier(n_estimators = 100) forest = forest.fit(train, target) if True: from sklearn import cross_validation scores = cross_validation.cross_val_score(forest, train, target, cv=10) print scores if False: from sklearn.grid_search import GridSearchCV forest = 
ExtraTreesClassifier(bootstrap=True,oob_score=True,random_state=42) max_features_choices = [n * 0.1 for n in range(1,10)] n_ests=[100, 200, 500, 1000] gs = GridSearchCV(forest, {'max_features': max_features_choices,'n_estimators': n_ests}, scoring = 'accuracy', cv = 10, n_jobs=-1) gs.fit(train, target) print "Score {0} with params {1}".format(gs.best_score_, gs.best_params_) print('fitting the model') forest = ExtraTreesClassifier(**gs.best_params_) forest.fit(train, target) # run model against test data predicted_class = forest.predict(test_X.values.astype(np.float)) # print predicted_class[0:9] # print(len(predicted_class)) predicted_class = ["%d,%d" % (orig_test_X.values[i,0], predicted_class[i]) for i in range(len(predicted_class))] csv_io.write_delimited_file(os.path.join(os.path.dirname(__file__), 'results', "simple_random_forest_solution_est_{0}.csv".format(current)), predicted_class, header=['PassengerId', 'Survived']) print ('Finished. Exiting.')
def main(): startCol = 0 endCol = 50 # max = 1775 train = csv_io.read_data("../Data/train.csv") target = [x[0] for x in train][1:3000] targetTest = [x[0] for x in train][3001:] trainTest = [x[startCol+1:endCol+1] for x in train][3001:] test = csv_io.read_data("../Data/test.csv") test = [x[startCol:endCol] for x in test] train = [x[startCol+1:endCol+1] for x in train][1:3000] fo = open("knn_stats.txt", "a+") rf = RidgeClassifier(alpha=0.01, fit_intercept=True, normalize=False, copy_X=True, tol=0.001) rf.fit(train, target) prob = rf.predict(trainTest) # changed from test result = 100 probSum = 0 for i in range(0, len(prob)): probX = prob[i] # [1] if ( probX > 0.7): probX = 0.7; if ( probX < 0.3): probX = 0.3; print i, probSum, probX, target[i] print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX) #print probSum #print len(prob) #print "C: ", 10**C, " gamma: " ,2**g print -probSum/len(prob) if ( -probSum/len(prob) < result ): result = -probSum/len(prob) predicted_probs = rf.predict(test) # was test predicted_probs = ["%f" % x for x in predicted_probs] csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs) print "Generated Data!!" #fo.write(str(5) + str(5)+ str(5)); fo.close() #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs) #predicted_probs = rf.predict_proba(train) # changed from test #predicted_probs = ["%f" % x[1] for x in predicted_probs] #predicted_probs = rf.predict(train) # changed from test #predicted_probs = ["%f" % x for x in predicted_probs] #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs) var = raw_input("Enter to terminate.")
def main(): startCol = 0 endCol = 1775 # max = 1775 train = csv_io.read_data("../Data/train.csv") target = [x[0] for x in train][1:3000] targetTest = [x[0] for x in train][3001:] trainTest = [x[startCol+1:endCol+1] for x in train][3001:] test = csv_io.read_data("../Data/test.csv") test = [x[startCol:endCol] for x in test] train = [x[startCol+1:endCol+1] for x in train][1:3000] fo = open("knn_stats.txt", "a+") #n_neighbors=15, weights='distance' return 0.65 #n_neighbors=3, weights='distance' 0.60 rf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='brute', leaf_size=100, warn_on_equidistant=True, p=2) # 'distance' rf.fit(train, target) prob = rf.predict(trainTest) # changed from test result = 100 probSum = 0 for i in range(0, len(prob)): probX = prob[i] # [1] if ( probX > 0.9): probX = 0.9; if ( probX < 0.1): probX = 0.1; print i, probSum, probX, target[i] print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX) #print probSum #print len(prob) #print "C: ", 10**C, " gamma: " ,2**g print -probSum/len(prob) if ( -probSum/len(prob) < result ): result = -probSum/len(prob) predicted_probs = rf.predict(test) # was test predicted_probs = ["%f" % x for x in predicted_probs] csv_io.write_delimited_file("../Submissions/knn.csv", predicted_probs) print "Generated Data!!" #fo.write(str(5) + str(5)+ str(5)); fo.close() #csv_io.write_delimited_file("../Submissions/rf_benchmark_test2.csv", predicted_probs) #predicted_probs = rf.predict_proba(train) # changed from test #predicted_probs = ["%f" % x[1] for x in predicted_probs] #predicted_probs = rf.predict(train) # changed from test #predicted_probs = ["%f" % x for x in predicted_probs] #csv_io.write_delimited_file("../Submissions/rf_benchmark_train2.csv", predicted_probs) var = raw_input("Enter to terminate.")
def main(): trainfilename = os.path.join(os.path.dirname(__file__), 'data', 'train.csv') testfilename = os.path.join(os.path.dirname(__file__), 'data', 'test.csv') train_X = pd.read_csv(trainfilename) print "Basic info on training data" print len(train_X) print len(train_X.columns) print train_X.columns train_Y = train_X.take([1], axis=1) train_X = train_X.drop('Survived',1) # print train_X.columns test_X = pd.read_csv(testfilename) #Binaries train_X["has_family"] = train_X.apply(family, axis=1) train_X["child"] = train_X.apply(child, axis=1) train_X["smallchild"] = train_X.apply(smallchild, axis=1) train_X["familysize"] = train_X.apply(familysize, axis=1) train_X["Sex"] = train_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1) test_X["has_family"] = test_X.apply(family, axis=1) test_X["child"] = test_X.apply(child, axis=1) test_X["smallchild"] = test_X.apply(smallchild, axis=1) test_X["familysize"] = test_X.apply(familysize, axis=1) test_X["Sex"] = test_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1) #Categorical ==> Use one hot necoding onehot = True if onehot: newcolumns= [] train_X["Embarked"] = train_X["Embarked"].apply(lambda port: selectembarkment(port)) test_X["Embarked"] = test_X["Embarked"].apply(lambda port: selectembarkment(port)) newcolumns.append("Embarked") train_X["fare2"] = train_X.apply(fare2, axis=1) test_X["fare2"] = test_X.apply(fare2, axis=1) newcolumns.append("fare2") train_X["nameinfo"] = train_X["Name"].apply(lambda name: nameinfo(name)) test_X["nameinfo"] = test_X["Name"].apply(lambda name: nameinfo(name)) newcolumns.append("nameinfo") train_X["Ticket"] = train_X["Ticket"].apply(lambda ticket: DeptCode(ticket)) test_X["Ticket"] = test_X["Ticket"].apply(lambda ticket: DeptCode(ticket)) newcolumns.append("Ticket") train_X["Name"] = train_X["Name"].apply(lambda name: nameparser(name)) test_X["Name"] = test_X["Name"].apply(lambda name: nameparser(name)) newcolumns.append("Name") train_X["Cabin"] = train_X["Cabin"].apply(lambda cabin: 
cabinparser(cabin)) test_X["Cabin"] = test_X["Cabin"].apply(lambda cabin: cabinparser(cabin)) newcolumns.append("Cabin") train_one_hot_X, test_one_hot_X = one_hot_dataframe(train_X, test_X, newcolumns, replace=True) train_X = train_one_hot_X test_X = test_one_hot_X else: train_X["Embarked"] = train_X["Embarked"].apply(lambda port: selectembarkment(port)) train_X["fare2"] = train_X["Fare"].apply(fare2, axis=1) train_X["nameinfo"] = train_X["Name"].apply(lambda name: nameinfo(name), axis=1) train_X["Ticket"] = train_X["Ticket"].apply(lambda ticket: DeptCode(ticket)) train_X["Name"] = train_X["Name"].apply(lambda name: nameparser(name)) train_X["Cabin"] = train_X["Cabin"].apply(lambda cabin: cabinparser(cabin)) test_X["Embarked"] = test_X["Embarked"].apply(lambda port: selectembarkment(port)) test_X["fare2"] = test_X["Fare"].apply(fare2, axis=1) test_X["nameinfo"] = test_X["Name"].apply(lambda name: nameinfo(name), axis=1) test_X["Ticket"] = test_X["Ticket"].apply(lambda ticket: DeptCode(ticket)) test_X["Name"] = test_X["Name"].apply(lambda name: nameparser(name)) test_X["Cabin"] = test_X["Cabin"].apply(lambda cabin: cabinparser(cabin)) imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit_transform(train_X['Age'], test_X['Age']) print "Finished reading data" train_X = train_X.fillna(-1) test_X = test_X.fillna(-1) # print train_X if False: for column in train_X.columns: print column, train_X[column] train_X = train_X.values.astype(np.float) test_X = test_X.values.astype(np.float) target = np.ravel(train_Y.values.astype(np.float)) trees = ExtraTreesClassifier(n_estimators=100,bootstrap=True,oob_score=True) trees.fit(train_X, target) pd.DataFrame(trees.feature_importances_).plot(kind='bar') selected_features = np.where(trees.feature_importances_ > 0.02)[0] #0.005)[0] #0.005 #[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15] #0.01 #[ 0 1 3 4 5 6 7 8 9 10 11 12 13 15] #0.02 #[ 0 1 3 4 5 6 7 8 9 10 12 13 15] #0.05 # [0 1 3 4 8] print selected_features train_selected_X = 
train_X[:, selected_features] test_selected_X = test_X[:, selected_features] # random forest code clf = svm.SVC() kernels = ['poly'] #, 'rbf', 'sigmoid'] degs=[2,3] # gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.] gammas = [1e-3]#, 1e-3, 1e-1, 1.] gs = GridSearchCV(clf, {'kernel': kernels,'degree': degs, 'gamma': gammas}, scoring = 'accuracy', cv = 10) gs.fit(train_selected_X, target) print "Score {0} with params {1}".format(gs.best_score_, gs.best_params_) print('fitting the model') clf = svm.SVC(**gs.best_params_) clf.fit(train_selected_X, target) # run model against test data predicted_class = clf.predict(test_selected_X) # print predicted_class[0:9] # print(len(predicted_class)) predicted_class = ["%d,%d" % (test_selected_X[i,0], predicted_class[i]) for i in range(len(predicted_class))] current = strftime("%Y%m%d", gmtime()) csv_io.write_delimited_file(os.path.join(os.path.dirname(__file__), 'results', "svm_0.02_solution_est_{0}.csv".format(current)), predicted_class, header=['PassengerId', 'Survived']) #0.02 #Trying sig, rbg, poly with degree 3 on 1e-3 and 1e-4 # Score 0.760942760943 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 3} all kernels degree 3 # real 408m13.291s #Score on kaggle 0.74163 # all polynomial 1-4 # Score 0.772166105499 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 2} # real 1077m38.691s #Score on kaggle 0.76555 #0.05 for comparison # all polynomial 1-4 # Score 0.763187429854 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 3} # real 340m42.558s # Your submission scored 0.74641, #Fixed features and implemented one hot coding # 0.02 polynomial 2 and 3 with 1e-3 # real 283m12.476s # Score 0.772166105499 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 2} # our submission scored 0.75598, print ('Finished. Exiting.')
def PreProcess5(): #note, 275 represents too much data, and the scaler fails with an exception. trainBase = csv_io.read_data("PreProcessData/training_PreProcess5_250.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess5_250.csv", skipFirstLine=False, split="\t") #shutil.copy2("PreProcessData/DataClassList5.csv", "PreProcessData/DataClassList6.csv") target = [x[0] for x in trainBase] train = [x[1:] for x in trainBase] DataClassList = csv_io.read_data( "PreProcessData/DataClassList_Importances_250.csv", False) print "Data len: ", len(train[0]) print "DataClassList len: ", len(DataClassList) #return # this seems about optimal, but has not been tuned on latest improvements. NumFeatures = 40 # NOTE going from 30 to 20 features on KNN5 set has almost no effect. Down to 15 is significant loss. # for GBM at 6 and 400 30 is 3.01 and 30 3.05. print "Scaling" targetPre = [x[0] for x in trainBase][0:10000] print "Scaling1" trainPre = [x[1:] for x in trainBase][0:10000] #testPre = [x[0:] for x in test] print "Scaling2" scaler = preprocessing.Scaler().fit(trainPre) print "Scaling3" trainScaled = scaler.transform(trainPre) #testScaled = scaler.transform(testPre) #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True) #gc.collect() print "Prep Classes" # prep for usage below... 
DataClassListTemp = [] for DataIndex, DataClass in enumerate(DataClassList): DataClassListTemp.append([DataClass[0], 0]) DataClassList = DataClassListTemp reduceBy = 5 totalFeatures = len(trainPre[0]) trainNew = [] testNew = [] print "Processing" while (totalFeatures > NumFeatures): if (totalFeatures - NumFeatures < 40): reduceBy = 3 if (totalFeatures - NumFeatures < 20): reduceBy = 2 if (totalFeatures - NumFeatures < 10): reduceBy = 1 if (totalFeatures - NumFeatures < reduceBy): reduceBy = totalFeatures - NumFeatures print "Reduce Features: ", reduceBy print "Training" clf = GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166, min_samples_leaf=30) clf.fit(trainScaled, targetPre) print "Computing Importances" importances = clf.feature_importances_ #print importances importancesSorted = sorted(importances, reverse=True) #print importancesSorted threshold = importancesSorted[len(importancesSorted) - reduceBy] print threshold #trainScaled = clf.transform(trainScaled, threshold) # only exists in RF trainScaledNew = [] for row in trainScaled: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) trainScaledNew.append(newRow) trainScaled = trainScaledNew print "Cols:", len(trainScaled) print "Rows:", len(trainScaled[0]) totalFeatures = totalFeatures - reduceBy print "Total Features:", totalFeatures trainNew = [] testNew = [] for row in train: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) trainNew.append(newRow) train = trainNew for row in test: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) testNew.append(newRow) test = testNew print "Train Cols:", len(train) print "Train Rows:", len(train[0]) print "Test Cols:", len(test) print "Test Rows:", len(test[0]) DataClassListNew = [] for Index, 
importance in enumerate(importances): if (importance > threshold): print DataClassList[Index][0], importance DataClassListNew.append([DataClassList[Index][0], importance]) DataClassList = DataClassListNew print "Data Transform Complete" # final steps, save data classes in new set csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_RFE2_" + str(NumFeatures) + ".csv", DataClassListNew) DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1), reverse=True) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_RFE2_sorted_" + str(NumFeatures) + ".csv", DataClassListNew_temp) # prepend the target on each row. trainFinal = [] rowIndex = 0 for row in train: newRow = [] for Index, val in enumerate(row): if (Index == 0): newRow.append(target[rowIndex]) newRow.append(val) trainFinal.append(newRow) rowIndex += 1 csv_io.write_delimited_file("PreProcessData/training_PreProcess6_RFE2_" + str(NumFeatures) + ".csv", trainFinal, delimiter="\t") csv_io.write_delimited_file("PreProcessData/test_PreProcess6_RFE2_" + str(NumFeatures) + ".csv", testNew, delimiter="\t")
def PreProcess2(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2_temp.csv", False) test = csv_io.read_data("PreProcessData/test_PreProcess2_temp.csv", False) target = [x[0] for x in trainBase] train = [x[1:] for x in trainBase] IndexList = [2,3,4,5,6] with open("PreProcessData/DataClassList.csv", "a") as myfile: myfile.write("\n") DataClassList = csv_io.read_data("PreProcessData/DataClassList.csv", False) #for myIndex in IndexList: for myIndex in range(2,75): #for myIndex in range(2,len(train[0]) - 2): MTrain = [] FTrain = [] MTarget = [] FTarget = [] for index, data in enumerate(train): if ( data[0] == "0" ): MTrain.append([data[1], data[myIndex]]) MTarget.append(target[index]) #print "M", data[1], data[myIndex] if ( data[0] == "1" ): FTrain.append([data[1], data[myIndex]]) FTarget.append(target[index]) #print "F", data[1], data[myIndex] #print MTrain print len(MTrain), len(FTrain),len(MTarget), len(FTarget) # better than GradBoost, and much better than KNN Mneigh = RandomForestClassifier() Fneigh = RandomForestClassifier() Mneigh.fit(MTrain, MTarget) Fneigh.fit(FTrain, FTarget) #count = 0 for index, data in enumerate(train): if ( data[0] == "0" ): pred = Mneigh.predict_proba([data[1], data[myIndex]]) #print "M", data[1], data[myIndex], pred[0][1], target[index] trainBase[index].append(pred[0][1]) #if ( str(pred[0][1]) == str(target[index])): # count = count + 1 if ( data[0] == "1" ): pred = Fneigh.predict_proba([data[1], data[myIndex]]) #print "F", data[1], data[myIndex], pred[0][1], target[index] trainBase[index].append(pred[0][1]) #if ( str(pred[0][1]) == str(target[index])): # count = count + 1 for index, data in enumerate(test): if ( data[0] == "0" ): pred = Mneigh.predict_proba([data[1], data[myIndex]]) #print "M", data[1], data[myIndex], pred[0][1], target[index] test[index].append(pred[0][1]) if ( data[0] == "1" ): pred = Fneigh.predict_proba([data[1], data[myIndex]]) #print "F", data[1], data[myIndex], pred[0][1], target[index] 
test[index].append(pred[0][1]) print myIndex, len(train[0]) with open("PreProcessData/DataClassList.csv", "a") as myfile: myfile.write("RF_Gender-Age-Class_" + str(DataClassList[myIndex][0]) + "_" + str(myIndex) + "\n") print "Writing Data" csv_io.write_delimited_file("PreProcessData/training_PreProcess2_temp_a.csv", trainBase) csv_io.write_delimited_file("PreProcessData/test_PreProcess2_temp_a.csv", test) print "Done."
def main(): #random.seed(5) #random.random() startCol = 0 endCol = 1775 # max = 1775 trainBase = csv_io.read_data("../Data/train.csv") result = 100 avg = 0 bootstraps = 5 # should be odd for median rnd_start = 456 predicted_list = [] spanDistance = 12 bootstrapLists = [] # this feature set got 0.45, which is no improvement over a single rf run. #for m_features in [52,56,60,66,72,80,90,100,110,120,130,140,150,160,170,180,190,200,220,240,260,280,300,350,400,450,500,550,600,650]: for m_features in [0]: predicted_list = [] #bs = cross_validation.Bootstrap(len(trainBase) - 1, n_bootstraps=bootstraps, train_size=0.7, random_state=0) bs = cross_validation.KFold(len(trainBase) - 1, k=5, indices=True, shuffle=False, random_state=None) for train_index, test_index in bs: trainBaseTemp = [trainBase[i+1] for i in train_index] #trainBaseTemp = trainBase target = [x[0] for x in trainBaseTemp]#[1001:3700] train = [x[1:] for x in trainBaseTemp]#[1001:3700] testBaseTemp = [trainBase[i+1] for i in test_index] #testBaseTemp = trainBase targetTest = [x[0] for x in testBaseTemp]#[1:1000] trainTest = [x[1:] for x in testBaseTemp]#[1:1000] test = csv_io.read_data("../Data/test.csv") test = [x[0:] for x in test] fo = open("rf_stats.txt", "a+") #rf = RandomForestClassifier(n_estimators=200, min_density=0.2, criterion="gini", random_state=rnd_start, max_features=m_features) # , max_features=None rf = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=rnd_start, verbose=0) # , max_features=None rf.fit(train, target) prob = rf.predict_proba(trainTest) # was test probSum = 0 totalOffByHalf = 0 for i in range(0, len(prob)): probX = prob[i][1] # [1] if ( probX > 0.999999999999): probX = 0.999999999999; if ( probX < 0.000000000001): probX = 0.000000000001; #print i, probSum, probX, target[i] #print 
target[i]*log(probX), (1-target[i])*log(1-probX) probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX) if ( math.fabs(probX - targetTest[i]) > 0.5 ): totalOffByHalf = totalOffByHalf + 1 print "Total Off By > 0.5 ", totalOffByHalf print "M-features: ", m_features print -probSum/len(prob) #fo.write(str(C) + "," + str(g) + "," + str(-probSum/len(prob))); avg += (-probSum/len(prob))/bootstraps predicted_probs = rf.predict_proba(test) # was test predicted_list.append([x[1] for x in predicted_probs]) fo.close() avg_list = [] med_list = [] for p in range(0, len(test)): temp_list =[] for q in range(0, len(predicted_list)): temp_list.append( predicted_list[q][p]) avg_list.append( mean(temp_list) ) med_list.append( getMedian(temp_list) ) #print p, q, temp_list, mean(temp_list), getMedian(temp_list) bootstrapLists.append(avg_list) if ( len(bootstrapLists) > 1 ): finalList = [] for p in range(0, len(test)): temp_list =[] for q in range(0, len(bootstrapLists)): temp_list.append( bootstrapLists[q][p]) finalList.append( meanSpan(temp_list, spanDistance) ) print p, q, temp_list, meanSpan(temp_list, spanDistance) else: finalList = bootstrapLists[0] avg_values = ["%f" % x for x in finalList] csv_io.write_delimited_file("../Submissions/rf2_stack_avg_benchmark.csv", avg_values) print "Average: ", avg var = raw_input("Enter to terminate.")
def run_rf(SEED):
    """Select the most important features with a weighted random forest.

    Fits a RandomForestRegressor on ../data/pre_shuffled_train.csv (weighted
    by ../data/weights.csv), keeps the NumFeatures columns with the highest
    importance plus the "id" and "var11" columns, and writes the reduced
    train/test frames to ../data/pre_rf_train.csv and ../data/pre_rf_test.csv.
    Every column name with its importance is written to
    ../preprocessdata/DataClassList_Importances_RF.csv.

    SEED is accepted for interface compatibility but is not used; the forest
    is created with random_state=None.
    """
    target = pd.read_csv('../data/pre_shuffled_target.csv')
    target = np.ravel(target.values)

    weights = pd.read_csv('../data/weights.csv')
    weights = np.ravel(weights.values)

    trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    test = pd.read_csv('../data/pre_shuffled_test.csv')

    NumFeatures = 30

    clf = RandomForestRegressor(n_estimators=30, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=True)

    print("Training")
    clf.fit(trainBase.values, target, sample_weight=weights)

    print("Computing Importances")
    importances = clf.feature_importances_
    print(importances)

    importancesSorted = sorted(importances, reverse=True)
    print(str(len(importancesSorted)) + " importances")

    if (len(importancesSorted) > NumFeatures):
        # The NumFeatures-th largest importance is the smallest one we keep.
        threshold = importancesSorted[NumFeatures - 1]
    else:
        # Nothing to eliminate: importances are never negative, so a zero
        # threshold keeps every column.
        threshold = 0.0
    print("Threshold: " + str(threshold))

    DataClassListNew = []
    dropColumns = []
    print(trainBase.columns.values)
    for DataIndex, DataClass in enumerate(trainBase.columns.values):
        print(str(DataIndex) + " " + DataClass + ", " + str(importances[DataIndex]))
        DataClassListNew.append([DataClass, importances[DataIndex]])
        # don't drop id or weights column.
        if (importances[DataIndex] < threshold and DataClass != "id" and DataClass != "var11"):
            dropColumns.append(DataClass)

    if dropColumns:
        trainBase.drop(dropColumns, axis=1, inplace=True)
        test.drop(dropColumns, axis=1, inplace=True)

    csv_io.write_delimited_file("../preprocessdata/DataClassList_Importances_RF.csv", DataClassListNew)

    submission = pd.DataFrame(trainBase)
    submission.to_csv("../data/pre_rf_train.csv", index=False)

    submission = pd.DataFrame(test)
    submission.to_csv("../data/pre_rf_test.csv", index=False)
def main(): trainBase = csv_io.read_data("PreProcessData/PreProcess2.csv", False) avg = 0 NumFolds = 5 # should be odd for median predicted_list = [] spanDistance = 12 bootstrapLists = [] CgList = [[0.0, -5.5]] for Cg in CgList: predicted_list = [] Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None) for train_index, test_index in Folds: trainBaseTemp = [trainBase[i+1] for i in train_index] #trainBaseTemp = trainBase target = [x[0] for x in trainBaseTemp] train = [x[1:] for x in trainBaseTemp] testBaseTemp = [trainBase[i+1] for i in test_index] #testBaseTemp = trainBase targetTest = [x[0] for x in testBaseTemp] trainTest = [x[1:] for x in testBaseTemp] test = csv_io.read_data("PreProcessData/PreTestData2.csv", False) test = [x[0:] for x in test] svc = svm.SVC(probability=True, C=10**Cg[0], gamma=2**Cg[1], cache_size=800, coef0=0.0, degree=3, kernel='rbf', shrinking=True, tol=0.001) svc.fit(train, target) prob = svc.predict_proba(trainTest) prob = SimpleScale(prob) # scale output probababilities probSum = 0 totalOffByHalf = 0 totalPositive = 0 totalPositiveOffByHalf = 0 totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i][1] # [1] if ( probX > 0.999): probX = 0.999; if ( probX < 0.001): probX = 0.001; #print i, probSum, probX, targetTest[i] #print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX) if ( math.fabs(probX - int(targetTest[i])) > 0.5 ): totalOffByHalf = totalOffByHalf + 1 if ( int(targetTest[i]) == 1 ): totalPositive = totalPositive + 1 if ( int(targetTest[i]) == 1 and probX < 0.5): totalPositiveOffByHalf = totalPositiveOffByHalf + 1 if (probX > 0.5): totalPositivePredictions = totalPositivePredictions + 1 print "Total Off By > 0.5 ", totalOffByHalf print "Total Positive ", totalPositive print "Total Positive Off By Half ", totalPositiveOffByHalf print "Total Positive Predictions ", 
totalPositivePredictions print "C/g: ", Cg[0], Cg[1] print -probSum/len(prob) avg += (-probSum/len(prob))/NumFolds predicted_probs = svc.predict_proba(test) # was test prob = SimpleScale(prob) # scale output probababilities predicted_list.append([x[1] for x in predicted_probs]) avg_list = [] med_list = [] # For N folds, get the average/median for each prediction item in test set. for p in range(0, len(test)): temp_list =[] for q in range(0, len(predicted_list)): temp_list.append( predicted_list[q][p]) avg_list.append( mean(temp_list) ) med_list.append( getMedian(temp_list) ) #print p, q, temp_list, mean(temp_list), getMedian(temp_list) bootstrapLists.append(avg_list) # This would be used if we ran multiple runs with different training values. # Primitive stacking, should rather save data, and do formal stacking. if ( len(bootstrapLists) > 1 ): finalList = [] for p in range(0, len(test)): temp_list =[] for q in range(0, len(bootstrapLists)): temp_list.append( bootstrapLists[q][p]) finalList.append( meanSpan(temp_list, spanDistance) ) print p, q, temp_list, meanSpan(temp_list, spanDistance) else: finalList = bootstrapLists[0] avg_values = ["%f" % x for x in finalList] csv_io.write_delimited_file("../Submissions/rf2_stack_avg.csv", avg_values) print "Average: ", avg var = raw_input("Enter to terminate.")
def run_stack(SEED):
    """Blend saved stack predictions with a Lasso model using K-fold CV.

    Every file in ../predictions whose name starts with "Stack" and whose
    third "_"-separated field (parsed as a float) exceeds lossThreshold
    becomes one input column, read from the file itself (test) and from its
    "Target_" counterpart (train).  Columns are standardised on train and
    test combined.  Each classifier in `clfs` is fitted per fold; the
    held-out predictions fill dataset_blend_train and the fold-averaged test
    predictions fill dataset_blend_test.  Submission files and a log line
    are written per classifier.

    SEED is accepted for interface compatibility but is not used.

    Returns:
        (dataset_blend_train, dataset_blend_test) as numpy arrays with one
        column per classifier.
    """
    model = "Lasso"
    lossThreshold = 0.38

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBaseOrig['var11']
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    NumFolds = 5

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if (filename.startswith("Stack") and float(parts[2]) > lossThreshold):
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, stackFile in enumerate(stackFiles):
        print(stackFile)
        # skip first because of header.
        trn = csv_io.read_data("../predictions/Target_" + stackFile, split=",", skipFirstLine=True)
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[1]

        # skip first because of header.
        tst = csv_io.read_data("../predictions/" + stackFile, split=",", skipFirstLine=True)
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num file processed: " + " " + str(len(stackFiles)) + " " + "Threshold: " + str(lossThreshold))

    print("Starting Scale")
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # should fit on the combined sets.
    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")
    clfs = [
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
        Lasso(alpha=0.000016681005372000593),
        #Ridge(),
        #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")
    lenTrainBase = len(trainBase)
    lenTest = len(test)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = np.array(np.reshape([targetBase[i] for i in train_index], (-1, 1)))
            train = [trainBase[i] for i in train_index]

            targetTest = np.array(np.reshape([targetBase[i] for i in test_index], (-1, 1)))
            trainTest = [trainBase[i] for i in test_index]
            weightTest = np.array(np.reshape([trainBaseWeight[i] for i in test_index], (-1, 1)))

            # Sample weights are not passed to fit(); they are only used for scoring.
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            print(predicted)

            dataset_blend_train[test_index, ExecutionIndex] = predicted  # [:,0] needed for Ridge

            # Score the fold once and reuse the value for printing and averaging.
            foldScore = score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(foldScore))
            avg += foldScore / NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  # [:,0] needed for Ridge

            foldCount = foldCount + 1

        # The test prediction of this classifier is the mean over its folds.
        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        runTag = now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv"

        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../submission/Blend_" + runTag, index=False)

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../submission/Target_Blend_" + runTag, index=False)

        csv_io.write_delimited_file("../log/RunLogBlend.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles], filemode="a", delimiter=",")

        print("------------------------Average: " + str(avg))

    return dataset_blend_train, dataset_blend_test
def preprocess(): train, test = util.get_train_test_df() columns = set(train.columns) #columns.remove("SalesID") #columns.remove("SalePrice") #columns.remove("saledate") #train_fea = get_date_dataframe(train["saledate"]) #test_fea = get_date_dataframe(test["saledate"]) #parseColumns = ["UsageBand"] parseColumns = [ "UsageBand","fiBaseModel","fiModelSeries","fiModelDescriptor","ProductSize","ProductGroup","Drive_System","Enclosure","Forks","Pad_Type","Ride_Control","Stick","Transmission","Turbocharged","Blade_Extension","Blade_Width","Enclosure_Type","Engine_Horsepower","Hydraulics","Pushblock","Ripper","Scarifier","Tip_ControlCoupler","Coupler_System","Grouser_Tracks","Hydraulics_Flow","Track_Type","Thumb","Pattern_Changer","Grouser_Type","Backhoe_Mounting","Blade_Type","Travel_Controls","Differential_Type","Steering_Controls"] #"auctioneerID","state","ProductGroupDesc",,"fiSecondaryDesc" # this is redundant "fiModelDesc", and has too many options... # Q, AC, AL AR AS colDict = {} for col in parseColumns: colDict[col] = [] colMap = {} notInTest = [] for index, col in enumerate(train.columns): print "MAP:", col, index colMap[col] = index if col in parseColumns: #print "start" s = set(x for x in train[col].fillna(0)) # 0 if x == "" or not isinstance(x, float) else x s.update(x for x in test[col].fillna(0)) # math.isnan(x) colDict[col] = s print s if col == "fiBaseModel": a = set(x for x in train[col].fillna(0)) b = set(x for x in test[col].fillna(0)) print "fiBaseModel" print print # found 11 type in test not in train print [x for x in b if x not in a] print print # found several hundred in train that are not in test, try dropping these... 
print [x for x in a if x not in b] notInTest = [x for x in a if x not in b] SaleIDArr = [] trainSalePriceArr = [] count = 0 csv_io.delete_file("train1.csv") for row in train.iterrows(): trainSalePrice = [] rowVals = row[1].fillna(0) newSet = [] newRow = [] if rowVals["fiBaseModel"] not in notInTest: continue trainSalePrice.append(rowVals["SalePrice"]) trainSalePriceArr.append(trainSalePrice) SaleID = [] SaleID.append(rowVals["SalesID"]) SaleIDArr.append(SaleID) for col in colDict.keys(): for val in colDict[col]: if val == rowVals[col] : newRow.append(1) else: newRow.append(0) #newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year newRow.append(rowVals["MachineHoursCurrentMeter"]) count += 1 if count % 10000 == 0: print "Count", count newSet.append(newRow) csv_io.write_delimited_file("train1.csv", newSet ,header=None, delimiter=",", filemode="a") csv_io.write_delimited_file("target.csv", trainSalePriceArr ,header=None, delimiter=",") csv_io.write_delimited_file("train_salesID.csv", SaleIDArr ,header=None, delimiter=",") # ------------------------------------------- SaleIDArr = [] count = 0 csv_io.delete_file("test1.csv") for row in test.iterrows(): rowVals = row[1].fillna(0) newSet = [] newRow = [] SaleID = [] SaleID.append(rowVals["SalesID"]) SaleIDArr.append(SaleID) for col in colDict.keys(): for val in colDict[col]: if val == rowVals[col] : newRow.append(1) else: newRow.append(0) #newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year newRow.append(rowVals["MachineHoursCurrentMeter"]) count += 1 if count % 10000 == 0: print "Count", count newSet.append(newRow) csv_io.write_delimited_file("test1.csv", newSet ,header=None, delimiter=",", filemode="a") csv_io.write_delimited_file("test_salesID.csv", SaleIDArr ,header=None, delimiter=",") if __name__=="__main__": preprocess()
def run_stack(SEED):
    """Cross-validate every regressor in ``clfs`` and save its predictions.

    For each model the training rows are split into ``NumFolds`` K-fold
    partitions.  Every fold fits the model on the in-fold rows, stores the
    out-of-fold predictions in ``dataset_blend_train`` and predicts the whole
    test set; the per-fold test predictions are averaged into
    ``dataset_blend_test``.  The fold score is the weighted mean absolute
    error, weighted by the first column of ``PreProcessData/Weights.csv``.

    Per model three files are written: the averaged test predictions
    (``Stack_*``), the out-of-fold training predictions (``Target_Stack_*``)
    and one row appended to ``RunLog.csv``.

    Args:
        SEED: Currently unused -- the seeded shuffle of the training rows is
            disabled (see the commented-out lines below).

    Returns:
        Tuple ``(dataset_blend_train, dataset_blend_test)`` of numpy arrays
        shaped ``(len(trainBase), len(clfs))`` and ``(len(test), len(clfs))``.
    """
    model = "Long-Lat KNN5 - 50 Features"
    print("Running GB, RF, ET stack.")

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine=False, split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine=False, split="\t")
    weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    # random.seed(SEED)
    # random.shuffle(trainBase)

    # 5 is good, but 10 yields a better mean since outliers are less
    # significant (note, predictions are less reliable when using 10).
    NumFolds = 5

    # Notes recorded by the author while tuning (kept verbatim where possible):
    #   note RF with 150 crashes on 30 features
    #   knn 5 at 3.45, knn 15 at 3.31, knn 25 at 3.30, knn 40 at 3.31
    #   LinearRegression at 3.77, Ridge at 3.77, SGD 4.23, Gauss at 13
    #   GB, 125 est is minimum, score is bad below this, explore higher and
    #   other dimensions.
    #   GB with max_depth 6/8/10 x n_estimators 100/200 was also listed.
    # about 1 hour run time, and 3.10 score.
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
    # about 2 hours run time at 3.05
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
    # about 2 hours run time at 3.06
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
    # about 4 hours run time at 3.06
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)
    clfs = [
        GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166),
    ]

    print("Data size:  %d %d" % (len(trainBase), len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Scaling")
    # Column 0 of every training row is the target, the rest are features;
    # test rows carry features only.
    targetPre = np.array([x[0] for x in trainBase])
    trainPre = [x[1:] for x in trainBase]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = np.asarray(scaler.transform(trainPre))
    testScaled = np.asarray(scaler.transform(test))

    print("Begin Training")
    for ExecutionIndex, clf in enumerate(clfs):
        print(str(clf))
        avg = 0
        dataset_blend_test_set = np.zeros((len(test), NumFolds))

        # Stratified for classification...
        Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
        for foldCount, (train_index, test_index) in enumerate(Folds):
            # Index arrays select the fold rows directly instead of rebuilding
            # Python lists row by row.
            target = targetPre[train_index]
            train = trainScaled[train_index]
            targetTest = targetPre[test_index]
            trainTest = trainScaled[test_index]

            print("")
            print("Iteration:  %d" % foldCount)
            print("LEN:  %d %d" % (len(train), len(target)))

            clf.fit(train, target)
            prob = clf.predict(trainTest)
            dataset_blend_train[test_index, ExecutionIndex] = prob

            # Weighted mean absolute error over the held-out rows, summed in
            # row order so the score matches the previous implementation.
            probSum = 0
            weightSum = 0
            for row, actual, guess in zip(test_index, targetTest, prob):
                rowWeight = weights[row][0]
                probSum += rowWeight * math.fabs(actual - guess)
                weightSum += rowWeight

            foldScore = probSum / weightSum
            print("Score:  %s" % (foldScore,))
            avg += foldScore / NumFolds

            dataset_blend_test_set[:, foldCount] = clf.predict(testScaled)

        blendedTest = dataset_blend_test_set.mean(1)
        dataset_blend_test[:, ExecutionIndex] = blendedTest

        now = datetime.datetime.now()
        fileTag = now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv"
        csv_io.write_delimited_file_single("../predictions_50/Stack_" + fileTag, blendedTest)
        csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + fileTag,
                                           dataset_blend_train[:, ExecutionIndex])
        # NOTE(review): the run log goes to predictions_40 while the
        # predictions above go to predictions_50 -- confirm this is intended.
        csv_io.write_delimited_file("../predictions_40/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf),
                                     str(NumFolds), model, "", ""],
                                    filemode="a", delimiter=",")

        print(now)
        print("------------------------Average:  %s" % (avg,))

    return dataset_blend_train, dataset_blend_test
def run_stack(SEED):
    """Build features, cross-validate each regressor and save its predictions.

    Features are the date columns returned by ``get_date_dataframe`` plus
    every remaining column of the training frame except ``SalesID``,
    ``SalePrice`` and ``saledate``.  Columns that contain ``str`` values are
    replaced by integer codes; a value that never occurs in the training
    column is coded ``-1``.

    Every model in ``clfs`` is run through ``NumFolds`` K-fold partitions.
    Out-of-fold predictions fill ``dataset_blend_train``; the per-fold test
    predictions are averaged into ``dataset_blend_test``.  The fold score is
    the root mean square of ``log10(actual) - log10(predicted)``.

    Per model three files are written under ``../predictions/``: the averaged
    test predictions (``Stack_*``), the out-of-fold training predictions
    (``Target_Stack_*``) and one row appended to ``RunLog.csv``.

    Args:
        SEED: Unused by this function.

    Returns:
        Tuple ``(dataset_blend_train, dataset_blend_test)`` of numpy arrays
        shaped ``(len(train_fea), len(clfs))`` and
        ``(len(test_fea), len(clfs))``.
    """
    train, test = util.get_train_test_df()

    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])

    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            # Map every distinct training value to an integer code; values
            # seen only in the test frame fall back to -1.
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
            train_fea = train_fea.join(
                pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
            test_fea = test_fea.join(
                pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print("Running Stack.")

    # 5 is good, but 10 yields a better mean since outliers are less
    # significant.
    NumFolds = 5

    # Other configurations left commented out by the author:
    #   loss='ls', n_estimators=300; loss='huber', n_estimators=3000
    #   (other arguments as below).
    clfs = [
        GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6,
                                  n_estimators=3000, random_state=166, min_samples_leaf=1),
    ]

    # Sizes come from the data; the previous literals (401125 / 11573) only
    # worked for one specific data set.
    lenTrainBase = len(train_fea)
    lenTest = len(test_fea)
    print("Data size:  %d %d" % (lenTrainBase, lenTest))

    dataset_blend_test = np.zeros((lenTest, len(clfs)))
    dataset_blend_train = np.zeros((lenTrainBase, len(clfs)))

    # Plain arrays allow positional row selection with the K-fold index
    # arrays.  NOTE(review): this matches the former label-based ``.ix``
    # lookups only if the frames carry the default 0..n-1 index -- confirm
    # against util.get_train_test_df.
    trainMatrix = np.asarray(train_fea)
    testMatrix = np.asarray(test_fea)
    salePrice = np.asarray(train["SalePrice"])

    print("Begin Training")
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        # Stratified for classification...
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for foldCount, (train_index, test_index) in enumerate(Folds):
            targetX = salePrice[train_index]
            trainX = trainMatrix[train_index]
            targetTest = salePrice[test_index]
            trainTest = trainMatrix[test_index]
            gc.collect()

            print("")
            print("Iteration:  %d" % foldCount)
            print("LEN:  %d %d" % (len(trainX), len(targetX)))

            print("Start %s" % (datetime.datetime.now(),))
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print("End  %s" % (datetime.datetime.now(),))

            dataset_blend_train[test_index, ExecutionIndex] = prob
            gc.collect()

            probSum = 0
            for actual, guess in zip(targetTest, prob):
                if guess < 0:
                    # Some predictions come out negative; score their
                    # magnitude so log10 is defined.
                    guess = -guess
                probSum += math.pow(math.log10(actual) - math.log10(guess), 2)

            foldScore = math.sqrt(probSum / len(prob))
            print("Score:  %s" % (foldScore,))
            avg += foldScore / NumFolds
            gc.collect()

            # One batch call replaces predicting the test rows one at a time.
            dataset_blend_test_set[:, foldCount] = clf.predict(testMatrix)
            gc.collect()

        blendedTest = dataset_blend_test_set.mean(1)
        dataset_blend_test[:, ExecutionIndex] = blendedTest

        now = datetime.datetime.now()
        fileTag = now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv"
        csv_io.write_delimited_file_single("../predictions/Stack_" + fileTag, blendedTest)
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + fileTag,
                                           dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf),
                                     "Folds:", str(NumFolds), "Model", model, "", ""],
                                    filemode="a", delimiter=",")

        print("------------------------Average:  %s" % (avg,))

    return dataset_blend_train, dataset_blend_test
def _one_hot_row(rowVals, colDict):
    """Return the 0/1 indicator list for one NaN-filled row.

    For every column in ``colDict`` one indicator is emitted per known value
    of that column, in the iteration order of ``colDict``.
    """
    encoded = []
    for col in colDict:
        # The row value is looked up lazily, so a column with no known values
        # is never read from the row.
        encoded.extend(1 if val == rowVals[col] else 0 for val in colDict[col])
    return encoded


def preprocess():
    """One-hot encode the categorical columns and write train/test CSV files.

    The distinct values of every column in ``parseColumns`` are collected
    over both the training and the test frame (missing values become 0).
    Each row is then turned into one indicator per (column, value) pair
    followed by ``MachineHoursCurrentMeter``.

    Training rows whose ``fiBaseModel`` never occurs in the test frame are
    dropped.

    Files written: ``train1.csv`` and ``test1.csv`` (features, appended one
    row at a time after deleting any previous file), ``target.csv``
    (``SalePrice`` of the kept training rows), ``train_salesID.csv`` and
    ``test_salesID.csv``.
    """
    train, test = util.get_train_test_df()

    # NOTE(review): "Tip_ControlCoupler" looks like two column names fused
    # together; if no such column exists it silently contributes no
    # features -- confirm against the data set's columns.
    parseColumns = [
        "UsageBand", "fiBaseModel", "fiModelSeries", "fiModelDescriptor", "ProductSize",
        "ProductGroup", "Drive_System", "Enclosure", "Forks", "Pad_Type", "Ride_Control",
        "Stick", "Transmission", "Turbocharged", "Blade_Extension", "Blade_Width",
        "Enclosure_Type", "Engine_Horsepower", "Hydraulics", "Pushblock", "Ripper",
        "Scarifier", "Tip_ControlCoupler", "Coupler_System", "Grouser_Tracks",
        "Hydraulics_Flow", "Track_Type", "Thumb", "Pattern_Changer", "Grouser_Type",
        "Backhoe_Mounting", "Blade_Type", "Travel_Controls", "Differential_Type",
        "Steering_Controls",
    ]
    # Not parsed: "auctioneerID", "state", "ProductGroupDesc", "fiSecondaryDesc".
    # "fiModelDesc" is redundant and has too many options...
    # Q, AC, AL AR AS

    colDict = {}
    for col in parseColumns:
        colDict[col] = []

    # fiBaseModel values that occur in train but never in test.
    notInTest = set()

    for index, col in enumerate(train.columns):
        print("MAP: %s %s" % (col, index))
        if col not in parseColumns:
            continue

        s = set(x for x in train[col].fillna(0))
        s.update(x for x in test[col].fillna(0))
        colDict[col] = s
        print(s)

        if col == "fiBaseModel":
            a = set(x for x in train[col].fillna(0))
            b = set(x for x in test[col].fillna(0))
            print("fiBaseModel")
            print("")
            print("")
            # found 11 type in test not in train
            print([x for x in b if x not in a])
            print("")
            print("")
            # found several hundred in train that are not in test, try
            # dropping these...
            trainOnly = [x for x in a if x not in b]
            print(trainOnly)
            notInTest = set(trainOnly)

    SaleIDArr = []
    trainSalePriceArr = []
    count = 0
    csv_io.delete_file("train1.csv")
    for row in train.iterrows():
        rowVals = row[1].fillna(0)

        # Drop rows whose base model never appears in the test set.  The
        # previous condition was inverted and kept only those rows.
        if rowVals["fiBaseModel"] in notInTest:
            continue

        trainSalePriceArr.append([rowVals["SalePrice"]])
        SaleIDArr.append([rowVals["SalesID"]])

        newRow = _one_hot_row(rowVals, colDict)
        # newRow.append(rowVals["YearMade"])  # need to calculate age, sale date minus year
        newRow.append(rowVals["MachineHoursCurrentMeter"])

        count += 1
        if count % 10000 == 0:
            print("Count %d" % count)

        csv_io.write_delimited_file("train1.csv", [newRow], header=None, delimiter=",", filemode="a")

    csv_io.write_delimited_file("target.csv", trainSalePriceArr, header=None, delimiter=",")
    csv_io.write_delimited_file("train_salesID.csv", SaleIDArr, header=None, delimiter=",")

    # -------------------------------------------
    SaleIDArr = []
    count = 0
    csv_io.delete_file("test1.csv")
    for row in test.iterrows():
        rowVals = row[1].fillna(0)
        SaleIDArr.append([rowVals["SalesID"]])

        newRow = _one_hot_row(rowVals, colDict)
        # newRow.append(rowVals["YearMade"])  # need to calculate age, sale date minus year
        newRow.append(rowVals["MachineHoursCurrentMeter"])

        count += 1
        if count % 10000 == 0:
            print("Count %d" % count)

        csv_io.write_delimited_file("test1.csv", [newRow], header=None, delimiter=",", filemode="a")

    csv_io.write_delimited_file("test_salesID.csv", SaleIDArr, header=None, delimiter=",")
print "Score: ", auc avg += auc/NumFolds predicted_probs = clf.predict_proba(finalTestSparse) #predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_set[:, foldCount] = predicted_probs[:,1] foldCount = foldCount + 1 #break dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1) now = datetime.datetime.now() csv_io.write_delimited_file_single_plus_index("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",") print "------------------------Average: ", avg
def run_stack(SEED):
    """Cross-validate each regressor on the ``model`` files and save results.

    Reads ``../models/<model>_train.csv`` and ``../models/<model>_test.csv``
    as features (all columns, NaN replaced by 0), the targets from
    ``../data/pre_shuffled_target.csv`` and the per-row weights from the
    ``var11`` column of the training file.

    Every model in ``clfs`` is run through ``NumFolds`` K-fold partitions.
    Out-of-fold predictions fill ``dataset_blend_train``; the per-fold test
    predictions are averaged into ``dataset_blend_test``.  Each fold is
    scored with ``score.normalized_weighted_gini``.  Both the raw score and
    the score after clipping negative predictions to 0 are printed, but only
    the raw score goes into the average, because the saved predictions are
    the unclipped ones.  (Previously both scores were added, so the average
    was the sum of two fold means.)

    Per model a test submission (``Stack_*``) and the out-of-fold training
    predictions (``Target_Stack_*``) are written under ``../predictions/``
    and one row is appended to ``../log/RunLog.csv``.

    Args:
        SEED: Currently unused; the seeded shuffle is commented out.

    Returns:
        Tuple ``(dataset_blend_train, dataset_blend_test)`` of numpy arrays
        shaped ``(len(trainBase), len(clfs))`` and ``(len(test), len(clfs))``.
    """
    model = "Lasso"

    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/' + model + '_train.csv')
    # Taken as a positional array so the K-fold index arrays can select rows.
    weightBase = np.asarray(trainBase['var11'])
    test = pd.read_csv('../models/' + model + '_test.csv')

    # trainBase = shuffle(trainBase, random_state = SEED)

    print(trainBase.columns)

    trainBaseID = trainBase['id']
    testID = test['id']

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    test = np.nan_to_num(np.array(test))

    NumFolds = 5

    # Alternatives left commented out by the author: Ridge(),
    # Lasso(alpha=0.0000329034456231), BaggingRegressor / AdaBoostRegressor
    # over Ridge(), RandomForestRegressor(n_estimators=3000), and
    # GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5,
    # max_depth=10, random_state=166, min_samples_leaf=1) with
    # n_estimators in 30 / 100 / 300 / 1000 / 3000.
    clfs = [
        LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for foldCount, (train_index, test_index) in enumerate(Folds):
            print("")
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            # Targets and weights are reshaped to single-column arrays.
            target = np.reshape(targetBase[train_index], (-1, 1))
            train = trainBase[train_index]
            targetTest = np.reshape(targetBase[test_index], (-1, 1))
            trainTest = trainBase[test_index]
            weightTest = np.reshape(weightBase[test_index], (-1, 1))

            # Sample weights are not passed to fit (the weighted call was
            # left disabled by the author).
            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            print(test_index)
            dataset_blend_train[test_index, ExecutionIndex] = predicted[:, 0]  # needed for Ridge

            # Score of the predictions exactly as they are saved.
            rawScore = score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(rawScore))
            avg += rawScore / NumFolds

            # Informational only: score after clipping negatives to 0.
            predicted[predicted[:, 0] < 0.0] = 0.0
            clippedScore = score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())
            print(str(clippedScore))

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted[:, 0]

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        fileTag = now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv"

        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../predictions/Stack_" + fileTag, index=False)

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../predictions/Target_Stack_" + fileTag, index=False)

        # The model name fills the field after the "Model" label, which was
        # previously left empty.
        csv_io.write_delimited_file("../log/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf),
                                     "Folds:", str(NumFolds), "Model", model, "", ""],
                                    filemode="a", delimiter=",")

        print("------------------------Average: " + str(avg))

    return dataset_blend_train, dataset_blend_test
def run_stack(SEED):
    """Tune an RBF-kernel SVC over a small grid of C and gamma exponents.

    Reads ``PreProcessData/train.csv`` (the first column of each row is used
    as the target, the remaining columns as features) and
    ``PreProcessData/test.csv``.  For every exponent pair in the grid an
    ``SVC(C=10**C, gamma=10**g)`` is fitted on the training part of the first
    K-fold split only, and its accuracy on the held-out part is printed.
    One summary row is appended to ``../tune/TuneLog.csv`` per entry in
    ``clfs``.

    Args:
        SEED: Not used by this function; kept so existing callers still work.

    Returns:
        A ``(dataset_blend_train, dataset_blend_test)`` tuple of arrays with
        one column per entry in ``clfs``.  Only the rows of the evaluated
        fold are filled in ``dataset_blend_train`` (each grid setting
        overwrites them); ``dataset_blend_test`` stays all zeros because
        test-set prediction is disabled in this tuning run.
    """
    def _echo(*parts):
        # Produces the same text as the Python 2 statement ``print a, b``
        # (parts joined by single spaces) but is spelled as a call with one
        # argument, so it behaves identically on Python 2 and Python 3.
        print(" ".join(str(part) for part in parts))

    model = "base"
    NumFolds = 5

    trainBase = csv_io_np.read_data("PreProcessData/train.csv", skipFirstLine=True, split=",")
    test = csv_io_np.read_data("PreProcessData/test.csv", skipFirstLine=True, split=",")
    _echo("Data Read Complete")

    # Tuning history:
    # 100 producted 94%
    # 1000 did not finish in about 5+ hours...
    # 300 about 5 hours, .9691 on first CF
    # learn_rate=0.01, n_estimators=300, subsample=1.0, min_samples_split=30, 0.9386
    # GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=300, subsample=1.0, min_samples_split=30, min_samples_leaf=1, max_depth=5, init=None, random_state=None, max_features=None)
    clfs = [
        SVC(C=1000000, kernel='rbf', degree=3, gamma=0.0000001, coef0=0.0,
            shrinking=True, probability=False, tol=0.001, cache_size=200,
            class_weight=None, verbose=False)
    ]

    _echo("Data size: ", len(trainBase), len(test))

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    _echo("Scaling")
    # Feature scaling is currently switched off: the raw columns are used.
    targetPre = [x[0] for x in trainBase]
    trainScaled = [x[1:] for x in trainBase]

    _echo("Begin Training")

    lenTrainBase = len(trainBase)
    # The raw rows are no longer needed; drop them before fitting.
    del trainBase, test
    gc.collect()

    # Grid of exponents: the SVC is built with C = 10**C and gamma = 10**g.
    CC = [6]
    gg = [-6.36, -6.35, -6.34, -6.33, -6.32]

    for ExecutionIndex, clf in enumerate(clfs):
        _echo(clf)

        foldCount = 0
        # NOTE(review): avg is never reset between grid settings, so the
        # "Score:" value logged below is the sum of the per-setting
        # accuracies rather than a single score -- confirm this is intended.
        avg = 0

        # Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for C in CC:
            for g in gg:
                for train_index, test_index in Folds:
                    _echo("g:", g, "C:", C)

                    target = [targetPre[i] for i in train_index]
                    train = [trainScaled[i] for i in train_index]

                    targetTest = [targetPre[i] for i in test_index]
                    trainTest = [trainScaled[i] for i in test_index]

                    _echo()
                    _echo("Iteration: ", foldCount)
                    _echo("LEN: ", len(train), len(train[0]), len(target), len(trainTest), len(trainTest[0]))

                    # The estimator taken from clfs is replaced by one
                    # configured for the current grid point.
                    clf = SVC(C=10**C, kernel='rbf', degree=4, gamma=10**g, coef0=0.0,
                              shrinking=True, probability=False, tol=0.001,
                              cache_size=200, class_weight=None, verbose=False)

                    _echo(datetime.datetime.now())
                    clf.fit(train, target)
                    _echo(datetime.datetime.now())

                    prob = clf.predict(trainTest)
                    dataset_blend_train[test_index, ExecutionIndex] = prob

                    # Accuracy on the held-out part: exact label matches.
                    probSum = float(sum(1 for expected, got in zip(targetTest, prob) if expected == got))
                    count = float(len(prob))

                    _echo("Sum: ", probSum, count)
                    _echo("Score: ", probSum / count)

                    avg += (probSum / count) / NumFolds

                    foldCount = foldCount + 1
                    # Only the first fold is evaluated for each grid setting.
                    break

        now = datetime.datetime.now()

        csv_io.write_delimited_file(
            "../tune/TuneLog.csv",
            [now.strftime("%Y %m %d %H %M %S"), "Score:", str(avg * NumFolds), str(clf),
             "Folds:", str(NumFolds), "Model", model, "", ""],
            filemode="a", delimiter=",")

    return dataset_blend_train, dataset_blend_test
def PreProcess5():
    """Reduce the PreProcess4 data set to its most important features.

    Steps:
      1. Read the tab-separated PreProcess4 training and test files (the
         first training column is the target) and copy DataClassList4 to
         DataClassList5.
      2. Fit a RandomForestRegressor on the scaled training features and
         write each feature's importance, unsorted and sorted by importance.
      3. Keep the features whose importance is strictly greater than that of
         the feature ranked at position ``NumFeatures`` (0-based) in the
         descending ranking, and write the reduced training set (target
         first) and test set.

    When there are at most ``NumFeatures`` features every feature is kept;
    previously that case failed because the threshold was never assigned.
    """
    def _echo(*parts):
        # Produces the same text as the Python 2 statement ``print a, b``
        # (parts joined by single spaces) but is spelled as a call with one
        # argument, so it behaves identically on Python 2 and Python 3.
        print(" ".join(str(part) for part in parts))

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", skipFirstLine = False, split = "\t")
    shutil.copy2("PreProcessData/DataClassList4.csv", "PreProcessData/DataClassList5.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList5.csv", False)

    _echo("Data len: ", len(train[0]))
    _echo("DataClassList len: ", len(DataClassList))

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = 40
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect. Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    _echo("Scaling")
    scaler = preprocessing.Scaler().fit(train)
    trainScaled = scaler.transform(train)

    clf = RandomForestRegressor(n_estimators=25, n_jobs=1, compute_importances=True)

    _echo("Training")
    # An RFE-based selection was tried at this point but produced memory
    # errors (probably too much data); a linear lasso was recommended instead.
    clf.fit(trainScaled, target)

    _echo("Computing Importances")
    importances = clf.feature_importances_

    # Pair every feature name with its importance and persist both orderings.
    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        _echo(DataClass[0], importances[DataIndex])
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file("PreProcessData/DataClassList_Importances_" + str(NumFeatures) + ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1), reverse=True)
    csv_io.write_delimited_file("PreProcessData/DataClassList_Importances_sorted_" + str(NumFeatures) + ".csv", DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    _echo(len(importancesTemp), "importances")

    if len(importancesTemp) > NumFeatures:
        threshold = importancesTemp[NumFeatures]
    else:
        # Fewer features than requested: nothing to cut, so use a threshold
        # that every importance exceeds.
        threshold = float("-inf")

    _echo("Importance threshold: ", threshold)

    # The set of surviving columns is the same for every row; compute it once.
    keep = [impIndex for impIndex, importance in enumerate(importances) if importance > threshold]

    # Training rows keep their target in the first column.
    trainNew = [[rowTarget] + [row[impIndex] for impIndex in keep]
                for rowTarget, row in zip(target, train)]
    testNew = [[row[impIndex] for impIndex in keep] for row in test]

    csv_io.write_delimited_file("PreProcessData/training_PreProcess5_" + str(NumFeatures) + ".csv", trainNew, delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess5_" + str(NumFeatures) + ".csv", testNew, delimiter="\t")
print(str(averageSet[idx][ExecutionIndex])) average += averageSet[idx][ExecutionIndex] submission1[col] = dataset_blend_testSet[idx][:, ExecutionIndex] submission2[col] = dataset_blend_trainSet[idx][:, ExecutionIndex] average = average / 5 now = datetime.datetime.now() submission1.to_csv("../submission/Stack" + dset + "_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(average) + "_" + str(clf)[:12] + ".csv", index=False) submission2.to_csv("../submission/Target_Stack" + dset + "_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(average) + "_" + str(clf)[:12] + ".csv", index=False) csv_io.write_delimited_file("../log/RunLog.csv", [ now.strftime("%Y %m %d %H %M %S"), "AVG.", str(average), str(clf), "Folds:", str(NumFolds), "Model", model, "dset", dset ], filemode="a", delimiter=",") print("------------------------Final Average: " + str(average))