def main():
    #read in data, parse into training and target sets
    train = csv.read_data("../Data/train.csv")
    target = np.array([x[0] for x in train])
    train = np.array([x[1:280] for x in train])

    #In this case we'll use a random forest, but this could be any classifier
    cfr = RandomForestClassifier(n_estimators=120, min_samples_split=2, n_jobs=-1, max_depth=None)  #.46
    #cfr = GradientBoostingClassifier(n_estimators=120, learn_rate=0.57, max_depth=1)  #.50
    #cfr = ExtraTreesClassifier(n_estimators=120, max_depth=None, min_samples_split=1)  #.489

    #Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    count = 0
    for traincv, testcv in cv:
        probas = cfr.fit(train[traincv], target[traincv]).predict_proba(train[testcv])
        result = logloss.llfun(target[testcv], [x[1] for x in probas])
        count += 1
        print('fold: %d, result: %f' % (count, result))
        results.append(result)

    #print out the mean of the cross-validated results
    print "Results: " + str(np.array(results).mean())

    test = csv.read_data("../Data/test.csv")
    predicted_probs = cfr.predict_proba([x[0:279] for x in test])
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    csv.write_delimited_file("../Submissions/rf_cv.csv", predicted_probs)
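# The fold scores above come from logloss.llfun, a helper that is not included in
# this collection. A minimal sketch of what a mean binary log-loss helper might look
# like; the 1e-15 clipping bound is an assumption, not taken from the original code.
import numpy as np

def llfun(actual, predicted):
    """Mean binary log loss; clips probabilities to avoid log(0)."""
    predicted = np.clip(np.asarray(predicted, dtype=float), 1e-15, 1.0 - 1e-15)
    actual = np.asarray(actual, dtype=float)
    return -np.mean(actual * np.log(predicted) + (1.0 - actual) * np.log(1.0 - predicted))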
def Analyze1():
    Threshold = 4.0
    targetFile = "Target_Stack_20121017110223_3.06649134025_GradientBoos.csv"

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv", skipFirstLine = False, split = "\t")
    shutil.copy2("PreProcessData/test_PreProcess3.csv", "PreProcessData/test_PreProcess8.csv")
    shutil.copy2("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList8.csv")
    weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)
    target = [x[0] for x in trainBase]

    print "Loading Data"
    trainNew = []
    probSum = 0.0
    weightSum = 0

    trn = csv_io.read_data("../predictions/" + targetFile, split="," ,skipFirstLine = False)
    for row, datum in enumerate(trn):
        if ( abs(datum[0] - target[row]) > Threshold):
            print datum[0], target[row]
            trainNew.append(trainBase[row])
        probSum += weights[row][0] * math.fabs(target[row] - datum[0])
        weightSum += weights[row][0]

    print "Train Score: ", (probSum/weightSum)
    print len(trainNew)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess8" + ".csv", trainNew, delimiter="\t")
def main(): #read in the training file train = csv_io.read_data("../data/train.csv") #set the training responses target = [x[0] for x in train] #set the training features train = [x[1:] for x in train] #read in the test file realtest = csv_io.read_data("../data/test.csv") # random forest code rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1) # fit the training data print('fitting the model') rf.fit(train, target) # run model against test data predicted_probs = rf.predict_proba(realtest) predicted_probs = ["%f" % x[1] for x in predicted_probs] csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs) print( 'Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle' )
def main(): #read in the training file train = csv_io.read_data("data/train.csv") target = ravel(csv_io.read_data("data/trainLabels.csv")) realtest = csv_io.read_data("data/test.csv") print len(realtest) # random forest code rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1, random_state=1, oob_score=True) # fit the training data print('fitting the model') rf.fit(train, target) # run model against test data predicted_probs = rf.predict_proba(realtest) predicted_class = rf.predict(realtest) print predicted_class[1:10] print(len(predicted_class)) predicted_probs = ["%f" % x[1] for x in predicted_probs] predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))] print predicted_class[0:9] print(len(predicted_class)) csv_io.write_delimited_file("results/random_forest_solution.csv", predicted_class, header=['Id', 'Solution'])
def PreProcess3():
    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2_temp.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2_temp.csv", False)

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    NumFeatures = 200

    #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True)
    chi = chi2(train, target)
    print "Training"
    #clf.fit(train, target)
    chi = SelectKBest(chi2, k=NumFeatures).fit(train, target)
    print chi.get_support(indices=True)
    print chi.transform(train), np.array(train)[:, [0]]

    # The chi2/SelectKBest path stops here; everything below is the older
    # importance-based selection, which is unreachable and also depends on the
    # commented-out clf above.
    return

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_
    #print importances
    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]
        #print "Sorted and deleted importances"
        #print importancesTemp

    rowIndex = 0
    for row in train:
        newRow = []
        for impIndex, importance in enumerate(importances):
            if (impIndex == 0):
                newRow.append(target[rowIndex])
            if (importance > threshold):
                newRow.append(row[impIndex])
        trainNew.append(newRow)
        rowIndex += 1

    for row in test:
        newRow = []
        for impIndex, importance in enumerate(importances):
            if (importance > threshold):
                #print impIndex, len(importances)
                newRow.append(row[impIndex])
        testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess2_chi.csv", trainNew)
    csv_io.write_delimited_file("PreProcessData/test_PreProcess2_chi.csv", testNew)
def Blend(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine = False, split = "\t") weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False) SEED = 448 #random.seed(SEED) #random.shuffle(trainBase) target = [x[0] for x in trainBase] dataset_blend_train, dataset_blend_test = stack.run_stack(SEED) clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True) ] # clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)] test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False) dataset_blend_test_set = np.zeros((len(test), len(clfs))) for ExecutionIndex, clf in enumerate(clfs): clf.fit(dataset_blend_train, target) submission = clf.predict(dataset_blend_test) submission = ["%f" % x for x in submission] now = datetime.datetime.now() csv_io.write_delimited_file("../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv", submission) # attempt to score the training set to predict score for blend... probSum = 0.0 weightSum = 0 trainPrediction = clf.predict(dataset_blend_train) for i in range(0, len(trainPrediction)): probX = trainPrediction[i] probSum += weights[i][0] * math.fabs(target[i] - probX) weightSum += weights[i][0] #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX) print "Train Score: ", (probSum/weightSum) dataset_blend_test_set[:, ExecutionIndex] = submission csv_io.write_delimited_file_single("../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv", dataset_blend_test_set.mean(1))
def Blend(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess4.csv", skipFirstLine=False, split="\t") weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False) SEED = 448 #random.seed(SEED) #random.shuffle(trainBase) target = [x[0] for x in trainBase] dataset_blend_train, dataset_blend_test = stack.run_stack(SEED) clfs = [LinearRegression(fit_intercept=True, normalize=False, copy_X=True)] # clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None), # LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)] test = csv_io.read_data("PreProcessData/test_PreProcess4.csv", False) dataset_blend_test_set = np.zeros((len(test), len(clfs))) for ExecutionIndex, clf in enumerate(clfs): clf.fit(dataset_blend_train, target) submission = clf.predict(dataset_blend_test) submission = ["%f" % x for x in submission] now = datetime.datetime.now() csv_io.write_delimited_file( "../Submissions/BlendSingle" + now.strftime("%Y%m%d%H%M%S") + ".csv", submission) # attempt to score the training set to predict score for blend... probSum = 0.0 weightSum = 0 trainPrediction = clf.predict(dataset_blend_train) for i in range(0, len(trainPrediction)): probX = trainPrediction[i] probSum += weights[i][0] * math.fabs(target[i] - probX) weightSum += weights[i][0] #probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX) print "Train Score: ", (probSum / weightSum) dataset_blend_test_set[:, ExecutionIndex] = submission csv_io.write_delimited_file_single( "../Submissions/FinalBlend_" + now.strftime("%Y%m%d%H%M%S") + ".csv", dataset_blend_test_set.mean(1))
def main(): training, target = csv_io.read_data("../Data/train.csv") training = [x[1:] for x in training] target = [float(x) for x in target] test, throwaway = csv_io.read_data("../Data/test.csv") test = [x[1:] for x in test] svc = svm.SVC(kernel='poly', degree=2) scores = cross_val_score(rf, training, target, cv=10) print np.mean(scores)
def main(): training, target = csv_io.read_data("../Data/train.csv") training = [x[1:] for x in training] target = [float(x) for x in target] test, throwaway = csv_io.read_data("../Data/test.csv") test = [x[1:] for x in test] rf = RandomForestClassifier(n_estimators=150, max_features=0.012) scores = cross_val_score(rf, training, target, cv=10) print np.mean(scores)
def main(): train = csv_io.read_data("../Data/train.csv") target = [x[0] for x in train] train = [x[1:] for x in train] test = csv_io.read_data("../Data/test.csv") rf = RandomForestClassifier(n_estimators=100, min_split=2) rf.fit(train, target) predicted_probs = rf.predict_proba(test) predicted_probs = ["%f" % x[1] for x in predicted_probs] csv_io.write_delimited_file("../Submissions/rf_benchmark.csv", predicted_probs)
def Blend(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False) SEED = 448 random.seed(SEED) random.shuffle(trainBase) target = [x[0] for x in trainBase] dataset_blend_train, dataset_blend_test = stack.run_stack(SEED) clfs = [LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None), LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None), LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None), LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None), LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None), LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)] test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False) dataset_blend_test_j = np.zeros((len(test), len(clfs))) for ExecutionIndex, clf in enumerate(clfs): #clf = LogisticRegression() clf.fit(dataset_blend_train, target) submission = clf.predict_proba(dataset_blend_test)[:,1] submission = ["%f" % x for x in submission] now = datetime.datetime.now() csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", submission) # attempt to score the training set to predict score for blend... probSum = 0.0 trainPrediction = clf.predict_proba(dataset_blend_train)[:,1] for i in range(0, len(trainPrediction)): probX = trainPrediction[i] if ( probX > 0.999): probX = 0.999; if ( probX < 0.001): probX = 0.001; probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX) print "Train Score: ", (-probSum/len(trainPrediction)) dataset_blend_test_j[:, ExecutionIndex] = submission csv_io.write_delimited_file_GUID_numpy("../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1)) var = raw_input("Enter to terminate.")
def main(): training, target = csv_io.read_data("../Data/train.csv") test, throwaway = csv_io.read_data("../Data/test.csv") n_test = len(test) n_target = len(set(target)) predicted_probs = [[0.001 for x in range(n_target)] for y in range(n_test)] predicted_probs = [["%f" % x for x in y] for y in predicted_probs] csv_io.write_delimited_file("../Submissions/uniform_benchmark.csv", predicted_probs)
def main(): train = csv_io.read_data("../Data/train.csv") targets = [int(x[0]) for x in train] num_targets = len(targets) num_ones = np.sum(targets) optimized_value = float(num_ones) / num_targets test = csv_io.read_data("../Data/test.csv") predicted_probs = ["%f" % optimized_value for x in test] csv_io.write_delimited_file("../Submissions/optimized_value_benchmark.csv", predicted_probs)
def main(): train = csv_io.read_data("../Data/train.csv") target = [x[0] for x in train] train = [x[1:] for x in train] test = csv_io.read_data("../Data/test.csv") svc = svm.SVC(probability=True) svc.fit(train, target) predicted_probs = svc.predict_proba(test) predicted_probs = ["%f" % x[1] for x in predicted_probs] csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
def main(): train = csv_io.read_data("../data/train.csv") target = [x[0] for x in train] train = [x[1:] for x in train] test = csv_io.read_data("../data/test.csv") svc = svm.SVC(probability=True) svc.fit(train, target) predicted_probs = svc.predict_proba(test) predicted_probs = ["%f" % x[1] for x in predicted_probs] csv_io.write_delimited_file("../submissions/svm_benchmark.csv", predicted_probs)
def PreProcessRun(dataSet): print print "DataSet: ", dataSet print "Loading Data" data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess.csv", split="\t", skipFirstLine=False) print dataSet, "Size: ", len(data[0]) if (dataSet == "training"): # do only once. shutil.copy2("PreProcessData/DataClassList.csv", "PreProcessData/DataClassList1.csv") DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv", False) offset = 0 offset2 = 0 if (dataSet == "test"): offset = 1 offset2 = -1 print DataClassList print "Appending New Data" firstTime = True for row in data: text = "" val = row[136 + offset2] / row[139 + offset2] row.append(val) if (firstTime and dataSet == "training"): # do only once. text = DataClassList[135 + offset][0] + "_DIV_" + DataClassList[ 139 + offset][0] csv_io.write_delimited_file("PreProcessData/DataClassList1.csv", [text], filemode="a") if (firstTime): print row[136 + offset2], row[139 + offset2], val, text firstTime = False csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess1.csv", data, delimiter="\t") print "Done."
def main(): training, target = csv_io.read_data("../Data/train.csv") training = [x[1:] for x in training] target = [float(x) for x in target] test, throwaway = csv_io.read_data("../Data/test.csv") test = [x[1:] for x in test] svc = svm.SVC(probability=True) svc.fit(training, target) predicted_probs = svc.predict_proba(test) predicted_probs = [[min(max(x,0.001),0.999) for x in y] for y in predicted_probs] predicted_probs = [["%f" % x for x in y] for y in predicted_probs] csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
def main(): training, target = csv_io.read_data("../Data/train.csv") training = [x[1:] for x in training] target = [float(x) for x in target] test, throwaway = csv_io.read_data("../Data/test.csv") test = [x[1:] for x in test] rf = RandomForestClassifier(n_estimators=100, min_split=2) rf.fit(training, target) predicted_probs = rf.predict_proba(test) predicted_probs = [[min(max(x,0.001),0.999) for x in y] for y in predicted_probs] predicted_probs = [["%f" % x for x in y] for y in predicted_probs] csv_io.write_delimited_file("../Submissions/rf_benchmark.csv", predicted_probs)
def main():
    np.random.seed(42)

    #read in the predictions from the three models to be bagged
    modelone = np.asarray(csv_io.read_data("results/gmm_pca12_6.csv", header=True))
    modeltwo = np.asarray(csv_io.read_data("results/random_forest_solution-12pca-4.csv", header=True))
    modelthree = np.asarray(csv_io.read_data("results/svm_pca12_5.csv", header=True))

    bagmodel = np.column_stack((modelone[:,1], modeltwo[:,1], modelthree[:,1]))
    bagsum = bagmodel.sum(axis=1)

    # majority vote: predict 1 when at least 2 of the 3 models predict 1
    predicted_class = np.zeros(bagsum.shape)
    predicted_class[bagsum >= 2] = 1

    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))

    csv_io.write_delimited_file("results/bagging_solution_7.csv", predicted_class, header=['Id', 'Solution'])
def main(strat = False, visualization = False):
    #read in the training file
    X = csv_io.read_data("data/train.csv")
    target = ravel(csv_io.read_data("data/trainLabels.csv"))
    realtest = csv_io.read_data("data/test.csv")
    print len(realtest)

    #pca
    pca = PCA(n_components=num_pca)
    pca.fit(X)
    train = pca.transform(X)
    test_transformed = pca.transform(realtest)
    print('performed pca')

    # support vector machine classifier
    clf = svm.SVC()

    if strat:
        print "stratified cross-validation on shuffled data"
        # adapted from http://stackoverflow.com/a/8281241
        crossval = []
        for i in range(strat):
            X, y = shuffle(train, target, random_state=i)
            skf = StratifiedKFold(y, 10)
            crossval.append([min(cross_val_score(clf, X, y, cv=skf)),
                             np.median(cross_val_score(clf, X, y, cv=skf)),
                             max(cross_val_score(clf, X, y, cv=skf))])
        print crossval

    if visualization:
        print "preparing visualization"
        data_train, data_test, target_train, target_test = train_test_split(train, target, test_size=0.20, random_state=42)
        plot1 = drawLearningCurve(clf, data_train, target_train, data_test, target_test)
        pp = PdfPages('figures/learningCurve.pdf')
        pp.savefig(plot1)
        pp.close()

    print('fitting the model')
    clf.fit(train, target)

    # run model against test data
    predicted_class = clf.predict(test_transformed)
    print predicted_class[0:9]
    print(len(predicted_class))

    print('Writing output')
    predicted_class = ["%d,%d" % (i+1, predicted_class[i]) for i in range(len(predicted_class))]
    print predicted_class[0:9]
    print(len(predicted_class))

    csv_io.write_delimited_file("results/svm_pca12_5.csv", predicted_class, header=['Id', 'Solution'])

    print ('Finished. Exiting.')
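# drawLearningCurve is an external helper not shown in this collection. A minimal
# sketch of what it might do, assuming it trains clf on growing subsets of the
# training split, scores both splits, and returns a matplotlib figure suitable for
# PdfPages.savefig (the step count and plot styling are assumptions):
import numpy as np
import matplotlib.pyplot as plt

def drawLearningCurve(clf, X_train, y_train, X_test, y_test, steps=10):
    """Plot train/test accuracy as the training set grows; return the figure."""
    sizes = np.linspace(0.1, 1.0, steps)
    counts, train_scores, test_scores = [], [], []
    for frac in sizes:
        n = max(2, int(frac * len(X_train)))
        clf.fit(X_train[:n], y_train[:n])
        counts.append(n)
        train_scores.append(clf.score(X_train[:n], y_train[:n]))
        test_scores.append(clf.score(X_test, y_test))
    fig = plt.figure()
    plt.plot(counts, train_scores, label='train accuracy')
    plt.plot(counts, test_scores, label='test accuracy')
    plt.xlabel('training examples')
    plt.ylabel('accuracy')
    plt.legend(loc='best')
    return fig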
def Blend(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False) target = [x[0] for x in trainBase] dataset_blend_train, dataset_blend_test = stack_knn.run_stack() clf = LogisticRegression() clf.fit(dataset_blend_train, target) submission = clf.predict_proba(dataset_blend_test)[:,1] submission = ["%f" % x for x in submission] now = datetime.datetime.now() csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M") + ".csv", "PreProcessData/test_PatientGuid.csv", submission) # attempt to score the training set to predict score for blend... probSum = 0.0 trainPrediction = clf.predict_proba(dataset_blend_train)[:,1] for i in range(0, len(trainPrediction)): probX = trainPrediction[i] if ( probX > 0.999): probX = 0.999; if ( probX < 0.001): probX = 0.001; probSum += int(target[i])*log(probX)+(1-int(target[i]))*log(1-probX) print "Train Score: ", (-probSum/len(trainPrediction)) var = raw_input("Enter to terminate.")
def main(): train = csv_io.read_data("{}/Data/train.csv".format(os.getcwd()), True) target = [float(x[0]) for x in train] # Remove the target from the training train = [x[1:] for x in train] # Remove the categoricals that I can't convert for x in train: del x[1] del x[1] del x[5] del x[6] cats = preprocess.enum_categ_data(train, "f", 10) preprocess.strf_to_floats(train, missing="average") # test = csv_io.read_data("{}/Data/test.csv".format(os.getcwd()), True) # # # Remove the categoricals that I can't convert # for x in test: # del x[1] # del x[1] # del x[5] # del x[6] # I can't just run enum_categ_data on test data, need to match the right cat to the right index!!! # cats = preprocess.enum_categ_data(test, 'f', 10) # preprocess.strf_to_floats(test, missing='average') rf = RandomForestClassifier(n_estimators=100, min_samples_split=2) rf.fit(train, target) print rf.score(train, target)
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./hotness_features.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) hotness_col = target familaritycol = train[:, 30] mydict = {} familarity_classes = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] for fclass in familarity_classes: mydict[fclass]=[] print len(target) for i in range(0, len(target)): key = familaritycol[i] if (key < 0.2): keyclass = 0.0 elif (key < 0.4): keyclass = 0.2 elif (key < 0.6): keyclass = 0.4 elif (key < 0.8): keyclass = 0.6 else: keyclass = 0.8 value = hotness_col[i] mydict[keyclass].append(value) print len(mydict[0.0]) + len(mydict[0.2]) + len(mydict[0.4]) + len(mydict[0.6]) + len(mydict[0.8]) plt.hist(mydict[0.8], normed=True) plt.show()
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./filtered_classes.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) hotness_col = target print "Length of target", len(hotness_col) col = train[:, 5] mydict = {} mydict[0] = [] mydict[1] = [] for i in range(0, len(hotness_col)): key = hotness_col[i] val = col[i] mydict[key].append(val) mylabels = ['Not Hot','Hot'] print len(mydict[0]), len(mydict[1]) plt.hist([mydict[0], mydict[1]], label=mylabels, normed=True) plt.xlabel('Artist Hotness') plt.ylabel('Frequency of songs(normalized)') plt.legend(loc='upper left') plt.show()
def Blend(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False) target = [x[0] for x in trainBase] dataset_blend_train, dataset_blend_test = stack_gb.run_stack() clf = LogisticRegression() clf.fit(dataset_blend_train, target) submission = clf.predict_proba(dataset_blend_test)[:, 1] submission = ["%f" % x for x in submission] now = datetime.datetime.now() csv_io.write_delimited_file_GUID( "../Submissions/stack_" + now.strftime("%Y%m%d%H%M") + ".csv", "PreProcessData/test_PatientGuid.csv", submission) # attempt to score the training set to predict score for blend... probSum = 0.0 trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1] for i in range(0, len(trainPrediction)): probX = trainPrediction[i] if (probX > 0.999): probX = 0.999 if (probX < 0.001): probX = 0.001 probSum += int( target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX) print "Train Score: ", (-probSum / len(trainPrediction)) var = raw_input("Enter to terminate.")
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./hotness_features.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) plt.plot(train[:,0], target,'ro') plt.xlim(-40,0) plt.show()
def main(): train=csv_io.read_data("Data/train.csv") train=train[0:10000] target=[x[0] for x in train] train=[x[1:] for x in train] realtest=csv_io.read_data("Data/test.csv") forest = RandomForestClassifier(n_estimators = 100) forest=forest.fit(train,target) predicted_probs=forest.predict_proba(realtest) fr=open('Result2.csv','w') fr.write("ImageId,Label"+"\n") count=1 for y in predicted_probs: index, value = max(enumerate(y), key=operator.itemgetter(1)) fr.write(str(count)+","+str(index)+"\n") count+=1 fr.close()
def main(): train=csv_io.read_data("Data/train.csv") train=train[0:3000] target=[x[0] for x in train] train=[x[1:] for x in train] realtest=csv_io.read_data("Data/test.csv") lr=LogisticRegression() lr.fit(train,target) predicted_probs=lr.predict_proba(realtest) fr=open('Results.csv','w') fr.write("ImageId,Label"+"\n") count=1 for y in predicted_probs: index, value = max(enumerate(y), key=operator.itemgetter(1)) fr.write(str(count)+","+str(index)+"\n") count+=1 fr.close()
def main(): #read in data, parse into training and target sets train = csv_io.read_data("./hotness_features_classes.csv") target = np.array( [x[0] for x in train] ) train = np.array( [x[1:] for x in train] ) train_scaled = preprocessing.scale(train) clf = tree.DecisionTreeClassifier(random_state = 0) scores = cross_validation.cross_val_score(clf, train_scaled, target,None, cv=10) print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()/2)
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./hotness_features.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) mydict = {} yearcol = train[:, 27] durationcol = train[:, 2] mydict[1960]=[] mydict[1970]=[] mydict[1980]=[] mydict[1990]=[] mydict[2000]=[] mydict[2010]=[] decade2010 = 0 for i in range(0,len(yearcol)) : year = yearcol[i] if ((year >= 1960) and (year < 1970)): decade = 1960 elif ((year >= 1970) and (year < 1980)): decade = 1970 elif ((year >=1980) and (year < 1990)): decade = 1980 elif ((year >= 1990) and (year < 2000)): decade = 1990 elif ((year >= 2000) and (year < 2010)): decade = 2000 else: decade2010 = decade2010 + 1 decade = 2010 mydict[decade].append(durationcol[i]) print decade2010 data = [mydict[1960], mydict[1970], mydict[1980], mydict[1990], mydict[2000], mydict[2010]] labels = ['1960', '1970','1980','1990','2000','2010'] mean60 = np.mean(mydict[1960]) mean70 = np.mean(mydict[1970]) mean80 = np.mean(mydict[1980]) mean90 = np.mean(mydict[1990]) mean2000 = np.mean(mydict[2000]) mean2010 = np.mean(mydict[2010]) plt.hist(data,bins=10, normed=True, label=labels, histtype='bar', cumulative=True) plt.legend() plt.figure(2) plt.plot(labels, [mean60, mean70, mean80, mean90, mean2000, mean2010], "ro") plt.show()
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./filtered_classes_musiconly.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) train_scaled = preprocessing.scale(train) clf = SVC(kernel='rbf', C=1000, gamma=0.001) scores = cross_validation.cross_val_score(clf, train_scaled, target, metrics.classification_report, cv=10) print scores
def PreProcessRun(dataSet): print print "DataSet: ", dataSet print "Loading Data" data = csv_io.read_data("PreProcessData/" + dataSet + "_PreProcess1.csv", split="\t" ,skipFirstLine = False) print dataSet, "Size: ", len(data[0]) if ( os.path.exists("PreProcessData/" + dataSet + "_PreProcess2.csv") ): os.remove("PreProcessData/" + dataSet + "_PreProcess2.csv") SkipArr = [0,2,4,172] DataClassList = csv_io.read_data("PreProcessData/DataClassList1.csv", False) DataClassListNew = [] firstTime = True for index, item in enumerate(data): rowNew = [] #print item for index, val in enumerate(item): if dataSet == "training" and (index - 1) in SkipArr: continue elif dataSet == "test" and index in SkipArr: continue rowNew.append(val) #print val if dataSet == "test" and firstTime == True: print DataClassList[index] DataClassListNew.append(DataClassList[index]) csv_io.write_delimited_file("PreProcessData/" + dataSet + "_PreProcess2.csv", [copy.deepcopy(rowNew)], filemode="a", delimiter="\t") firstTime = False if dataSet == "test": csv_io.write_delimited_file("PreProcessData/DataClassList2.csv", DataClassListNew) print "Done."
def PostProcess(): lossThreshold = 4.0 # best seems to be about 4.0 model = "Long-Lat KNN5" #used only for targets values. trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_40.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess4_40.csv", False) weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False) target = [x[0] for x in trainBase] stackFiles = [] for filename in os.listdir("../predictions"): parts = filename.split("_") if (filename[0:5] == "Stack" and float(parts[2]) < lossThreshold): stackFiles.append(filename) dataset_blend_train = np.zeros((len(trainBase), len(stackFiles))) dataset_blend_test = np.zeros((len(test), len(stackFiles))) print "Loading Data" for fileNum, file in enumerate(stackFiles): print file trn = csv_io.read_data("../predictions/Target_" + file, split=",", skipFirstLine=False) for row, datum in enumerate(trn): dataset_blend_train[row, fileNum] = datum[0] tst = csv_io.read_data("../predictions/" + file, split=",", skipFirstLine=False) for row, datum in enumerate(tst): dataset_blend_test[row, fileNum] = datum[0] np.savetxt('temp/dataset_blend_trainX.txt', dataset_blend_train) np.savetxt('temp/dataset_blend_testX.txt', dataset_blend_test) np.savetxt('temp/dataset_blend_trainY.txt', target) print "Num file processed: ", len(stackFiles), "Threshold: ", lossThreshold
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./filtered_classes.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) mydict = {} yearcol = train[:, 9] energycol = train[:, 13] mydict[1960]=[] mydict[1970]=[] mydict[1980]=[] mydict[1990]=[] mydict[2000]=[] mydict[2010]=[] for i in range(0,len(yearcol)) : year = yearcol[i] if ((year >= 1960) and (year < 1970)): decade = 1960 elif ((year >= 1970) and (year < 1980)): decade = 1970 elif ((year >=1980) and (year < 1990)): decade = 1980 elif ((year >= 1990) and (year < 2000)): decade = 1990 elif ((year >= 2000) and (year < 2010)): decade = 2000 else: decade = 2010 mydict[decade].append(energycol[i]) mean60 = np.median(mydict[1960]) mean70 = np.median(mydict[1970]) mean80 = np.median(mydict[1980]) mean90 = np.median(mydict[1990]) mean2000 = np.median(mydict[2000]) mean2010 = np.median(mydict[2010]) print mean60, mean70, mean80, mean90, mean2000, mean2010 print len(mydict[1960]),len(mydict[1970]), len(mydict[1980]), len(mydict[1990]), len(mydict[2000]), len(mydict[2010]) data = [mydict[1960], mydict[1970], mydict[1980], mydict[1990], mydict[2000], mydict[2010]] labels = ['1960', '1970', '1980', '1990', '2000','2010'] plt.hist(data, bins=20, normed=True, label=labels) # plt.hist(mydict[1960],bins=50, normed=True, label='1960',histtype='stepfilled', cumulative=True) # plt.hist(mydict[1980],bins=50, normed=True, label='1980', histtype='stepfilled', cumulative=True) # plt.hist(mydict[2010],bins=50 ,normed=True, label='2010', histtype='stepfilled', cumulative=True) plt.legend(loc='upper left') plt.figure(2) plt.plot(labels, [mean60, mean70, mean80, mean90, mean2000, mean2010]) plt.show()
def main(): et = csv_io.read_data("../Submissions/et_stack_avg_benchmark.csv", False) rbf = csv_io.read_data( "../Submissions/svm-rbf-bootstrap-stack_meanSpan_benchmark.csv", False) poly = csv_io.read_data( "../Submissions/svm-poly-bootstrap-stack_meanSpan_benchmark.csv", False) rf = csv_io.read_data("../Submissions/rf2_avg_benchmark.csv", False) gb = csv_io.read_data("../Submissions/gb_avg_benchmark.csv", False) stack = [] stack.append(et) stack.append(rbf) stack.append(poly) stack.append(rf) stack.append(gb) spanDistance = 3 finalList = [] for p in range(0, len(stack[0])): temp_list = [] for q in range(0, len(stack)): temp_list.append(stack[q][p][0]) avg = sum(temp_list) / float(len(stack)) if (avg < 0.5): finalList.append(0.2) #finalList.append(min(temp_list)) print p, q, temp_list, avg, min(temp_list) else: finalList.append(0.80) #finalList.append(max(temp_list)) print p, q, temp_list, avg, max(temp_list) #finalList.append( meanSpan(temp_list, spanDistance) ) #print p, q, temp_list, meanSpan(temp_list, spanDistance) finalStack = ["%f" % x for x in finalList] csv_io.write_delimited_file("../Submissions/stack.csv", finalStack) var = raw_input("Enter to terminate.")
def PreProcess3():
    filename = "stack201208301510"
    data = csv_io.read_data("../Submissions/" + filename + ".csv", False)

    data = SimpleScale(data, floor=0.05, ceiling=0.90)  # took 0.389 score and lowered it to 0.40, not good...

    csv_io.write_delimited_file("../Submissions/" + filename + "_SimpleScale.csv", data)
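# SimpleScale is not defined anywhere in this collection. Judging from the
# floor/ceiling keywords and the note that it hurt the score, one plausible reading
# is a linear rescaling of the single-column predictions into [floor, ceiling].
# This sketch is that assumption, not the original helper.
def SimpleScale(rows, floor=0.05, ceiling=0.90):
    preds = [row[0] for row in rows]
    lo, hi = min(preds), max(preds)
    span = (hi - lo) or 1.0
    return [[floor + (p - lo) * (ceiling - floor) / span] for p in preds]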
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./hotness_features_classes.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) train_scaled = preprocessing.scale(train) clf = SVC(kernel='linear') selector = RFECV(clf, step=1, cv=10) selector = selector.fit(train_scaled, target) print selector.support_
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./filtered_classes.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) train_scaled = preprocessing.scale(train) X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_scaled, target, test_size = 0.8) clf = SVC(kernel='linear', C=0.005).fit(X_train, y_train) print clf.score(X_test, y_test)
def main(): #read in the training file train = csv_io.read_data("train.csv") #set the training responses target = [x[0] for x in train] #set the training features train = [x[1:] for x in train] #read in the test file realtest = csv_io.read_data("test.csv") # code for logistic regression lr = LogisticRegression() lr.fit(train, target) predicted_probs = lr.predict_proba(realtest) # write solutions to file predicted_probs = ["%f" % x[1] for x in predicted_probs] csv_io.write_delimited_file("log_solution.csv", predicted_probs) print ('Logistic Regression Complete! Submit log_solution.csv to Kaggle')
def main(): #read in data, parse into training and target sets data = csv_io.read_data("./filtered_classes_musiconly.csv") target = np.array( [x[0] for x in data] ) train = np.array( [x[1:] for x in data] ) train_scaled = preprocessing.scale(train) X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_scaled, target, test_size = 0.8) clf = SVC(kernel='rbf', C = 1000.0, gamma=0.001).fit(X_train, y_train) y_val_predict = clf.predict(X_test) print metrics.zero_one_score(y_test, y_val_predict)
def mp_worker(fn):
    data = csv_io.read_data(fn, 0, l=85)
    data = np.array(data)
    np.random.shuffle(data)  # shuffle the rows in place
    #print len(data[0])
    y = [x[len(x) - 1] for x in data]
    X = [x[0:len(x) - 1] for x in data]
    #print len(X), len(X[0])
    lrf = LBLRFImbalanced(fn, X, y)
    return
def main(): #read in the training file train = csv_io.read_data("train.csv") #set the training responses target = [x[0] for x in train] #set the training features train = [x[1, 3, 4, 5, 6] for x in train] #read in the test file realtest = csv_io.read_data("test.csv") # random forest code rf = RandomForestClassifier(n_estimators=10) # fit the training data print('fitting the model') rf.fit(train, target) # run model against test data predicted_probs = rf.predict_proba(realtest) predicted_probs = ["%f" % x[1] for x in predicted_probs] csv_io.write_delimited_file("random_forest_solution.csv", predicted_probs)
def main(): train = csv_io.read_data("../Data/train.csv") target = [x[0] for x in train][1:] # skip the headers probabilities = csv_io.read_data("../Submissions/svm_benchmark.csv") prob = [x[0] for x in probabilities] probSum = 0 for i in range(0, len(prob)): #tempProb = max(prob[i], 0.000001) #tempProb = min(tempProb, 0.999999) #tempProb = max(prob[i], 0.1) #tempProb = min(tempProb, 0.9) print i, probSum, prob[i], target[i] print target[i] * log(prob[i]), (1 - target[i]) * log(1 - prob[i]) probSum += target[i] * log( prob[i]) + (1 - target[i]) * log(1 - prob[i]) print probSum print len(prob) print -probSum / len(prob) #result = (-1/len(probs))*mySum; var = raw_input("Enter to terminate.")
def getBenchmark(test_label):
    tfile = open('MyBenchmark.csv', 'wb')
    test_info = csv_io.read_data('test_info.csv')
    try:
        twriter = csv.writer(tfile, delimiter=',')
        twriter.writerow(['Id', 'Prediction'])
        index = 0
        print np.array(test_info)
        print test_info[0][0], test_label[0]
        #print np.array(test_info)
        for id in test_info:
            #print 'converted line', index
            twriter.writerow([int(id[0]), test_label[index]])
            index += 1
    finally:
        tfile.close()
def runKmeans(data_file):
    train_data = csv_io.read_data(data_file)
    print len(train_data)

    num_clusters = 10
    model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)

    max_score = 0
    iteration = 2
    best_classification = []
    for i in range(1, iteration):
        print "Iteration number " + str(i)
        model.fit(train_data)
        score = model.score(train_data)
        if i == 1 or score > max_score:
            max_score = score
            best_classification = model.predict(train_data)

    print len(best_classification.tolist())
    return best_classification.tolist()
def run_stack(SEED): model = "Long-Lat KNN5 - 50 Features" print "Running GB, RF, ET stack." trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine = False, split = "\t") test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine = False, split = "\t") weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False) #random.seed(SEED) #random.shuffle(trainBase) avg = 0 NumFolds = 5 # 5 is good, but 10 yeilds a better mean since outliers are less significant. (note, predictions are less reliable when using 10). predicted_list = [] bootstrapLists = [] # use this for quick runs. # note RF with 150 crashes on 30 features # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # RandomForestRegressor(n_estimators=100, n_jobs=1), #RandomForestRegressor(n_estimators=75, n_jobs=1), # clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1), # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False) # ] #knn 5 at 3.45 #knn 15 at 3.31 #knn 25 at 3.30 #knn 40 at 3.31 # KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # LinearRegression at 3.77 # Ridge at 3.77 # SGD 4.23 #Gauss at 13 # LinearRegression(fit_intercept=True, normalize=False, copy_X=True), # Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001), # SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False), # GaussianNB() # clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2) # ] # GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ****************** # clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166) # ] # about 1 hour run time, and 3.10 score. 
#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166) # about 2 hours run time at 3.05 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166) # about 2 hours run time at 3.06 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166) # about 4 hours run time at 3.06 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166) clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166) ] # use this for quick runs. # clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, 
criterion='entropy', bootstrap=True, random_state=5)] # use this for quick runs. reduced estimators to 50 # clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, # tol=0.001, verbose=False) # ] #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), #ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1) # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', 
            bootstrap=True, random_state=7)]  # full algorithm stack.

    # Alternative stacks tried previously (kept commented out for reference):
    # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
    #         ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
    #         GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #         GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
    #         GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
    #         RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    #         RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
    #         RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
    #         RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
    #         RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
    #         RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
    #         RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
    #         RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]

    print "Data size: ", len(trainBase), len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]
    #print trainPre[0]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    testScaled = scaler.transform(testPre)
    #print scaler.mean_
    #print scaler.std_

    print "Begin Training"
    for ExecutionIndex, clf in enumerate(clfs):
        print str(clf)
        avg = 0
        predicted_list = []

        dataset_blend_test_set = np.zeros((len(test), NumFolds))

        foldCount = 0
        # Stratified for classification... [trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
        for train_index, test_index in Folds:
            #trainBaseTemp = [trainBase[i] for i in train_index]
            #target = [x[0] for x in trainBaseTemp]
            #train = [x[1:] for x in trainBaseTemp]
            #testBaseTemp = [trainBase[i] for i in test_index]
            #targetTest = [x[0] for x in testBaseTemp]
            #trainTest = [x[1:] for x in testBaseTemp]
            #test = [x[0:] for x in test]

            target = [targetPre[i] for i in train_index]
            train = [trainScaled[i] for i in train_index]

            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainScaled[i] for i in test_index]

            print
            print "Iteration: ", foldCount
            print "LEN: ", len(train), len(target)

            clf.fit(train, target)
            prob = clf.predict(trainTest)

            dataset_blend_train[test_index, ExecutionIndex] = prob

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]

                probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                weightSum += weights[test_index[i]][0]
                #print "Weight", weights[test_index[i]][0], "Index: ", i, "Test_Index: ", test_index[i], "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss calc
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                #     totalOffByHalf = totalOffByHalf + 1
                # if ( int(targetTest[i]) == 1 ):
                #     totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5 ):
                #     totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if ( probX > 0.5 ):
                #     totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions

            #print -probSum/len(prob)
            print "Score: ", probSum / weightSum
            avg += (probSum / weightSum) / NumFolds

            predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1))
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions_40/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a", delimiter=",")

        print now
        print "------------------------Average: ", avg

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test

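# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the loop above
# builds "level-1" features by out-of-fold prediction -- each base model only
# predicts rows it was not trained on, and its test-set predictions are
# averaged over the folds.  A minimal, self-contained version of the same
# pattern with the current scikit-learn API (hypothetical names, synthetic
# data) might look like this:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

def out_of_fold_stack(models, X_train, y_train, X_test, n_folds=5):
    """Return (train_meta, test_meta): one column of predictions per model."""
    train_meta = np.zeros((len(X_train), len(models)))
    test_meta = np.zeros((len(X_test), len(models)))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    for m, model in enumerate(models):
        test_fold_preds = np.zeros((len(X_test), n_folds))
        for f, (tr_idx, va_idx) in enumerate(kf.split(X_train)):
            model.fit(X_train[tr_idx], y_train[tr_idx])
            train_meta[va_idx, m] = model.predict(X_train[va_idx])  # out-of-fold predictions
            test_fold_preds[:, f] = model.predict(X_test)           # per-fold test predictions
        test_meta[:, m] = test_fold_preds.mean(axis=1)              # average over folds
    return train_meta, test_meta

# tiny synthetic demo
rng = np.random.RandomState(0)
X, y = rng.rand(200, 5), rng.rand(200)
X_test = rng.rand(50, 5)
tr_meta, te_meta = out_of_fold_stack([RandomForestRegressor(n_estimators=20, random_state=1)], X, y, X_test)
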
def PreProcess3():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", split="\t", skipFirstLine=False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", split="\t", skipFirstLine=False)
    weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False)

    print "Train Size: ", len(trainBase[0]), "Test Size: ", len(test[0])

    shutil.copy2("PreProcessData/DataClassList2.csv", "PreProcessData/DataClassList3.csv")

    # The last two columns are latitude and longitude.
    lat = len(trainBase[0]) - 2
    long = len(trainBase[0]) - 1

    target = [x[0] for x in trainBase]
    train = [x[lat:long + 1] for x in trainBase]

    n_neighborsArr = [5]
    leaf_sizeArr = [30]

    for n_neighbor in n_neighborsArr:
        for leaf_s in leaf_sizeArr:

            print "Training neighbors: ", n_neighbor, "leaf_size: ", leaf_s
            neigh = KNeighborsRegressor(n_neighbors=n_neighbor, warn_on_equidistant=False, leaf_size=leaf_s, algorithm="ball_tree", weights=myFunc)
            neigh.fit(train, target)

            probSum = 0
            weightSum = 0

            for index, data in enumerate(trainBase):
                pred = neigh.predict([data[lat], data[long]])
                #print data[lat], data[long], "Prediction: ", pred[0], "Target: ", target[index]
                if (len(n_neighborsArr) == 1):
                    trainBase[index].append(pred[0])
                probSum += weights[index][0] * math.fabs(target[index] - pred[0])
                weightSum += weights[index][0]

            print "Score: ", probSum / weightSum

            if (len(n_neighborsArr) > 1):
                continue

            for index, data in enumerate(test):
                pred = neigh.predict([data[lat - 1], data[long - 1]])
                #print data[lat - 1], data[long - 1], "Prediction: ", pred[0]
                if (len(n_neighborsArr) == 1):
                    test[index].append(pred[0])

    if (len(n_neighborsArr) > 1):
        return

    with open("PreProcessData/DataClassList3.csv", "a") as myfile:
        myfile.write("Lat-Long-Predictor\n")

    print "Writing Data"
    csv_io.write_delimited_file("PreProcessData/training_PreProcess3.csv", trainBase, delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess3.csv", test, delimiter="\t")
    print "Done."

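# ---------------------------------------------------------------------------
# Illustrative sketch: PreProcess3 above smooths the target over geography by
# fitting a KNN regressor on the (lat, long) columns only and appending its
# prediction as a new feature.  The sketch below assumes plain 'distance'
# weighting instead of the custom myFunc used above, and uses synthetic data
# and hypothetical names.

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
coords = rng.uniform(low=(30.0, -120.0), high=(45.0, -70.0), size=(500, 2))  # (lat, long) pairs
target = coords[:, 0] * 0.1 + rng.normal(scale=0.5, size=500)                # fake target

knn = KNeighborsRegressor(n_neighbors=5, weights="distance", algorithm="ball_tree", leaf_size=30)
knn.fit(coords, target)

# Append the geographic prediction as an extra feature column.  Note that,
# like the original, this predicts on the training rows themselves, so each
# point is its own nearest neighbour and the new column is somewhat optimistic.
geo_feature = knn.predict(coords)
augmented = np.column_stack([coords, geo_feature])
print(augmented[:3])
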
def run_stack(SEED):

    model = "Lasso"
    lossThreshold = 0.38

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBaseOrig['var11']
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    avg = 0
    NumFolds = 5

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        if (filename[0:5] == "Stack" and float(parts[2]) > lossThreshold):
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, file in enumerate(stackFiles):
        print(file)
        trn = csv_io.read_data("../predictions/Target_" + file, split=",", skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[1]
        tst = csv_io.read_data("../predictions/" + file, split=",", skipFirstLine=True)  # skip first because of header.
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num files processed: " + str(len(stackFiles)) + " Threshold: " + str(lossThreshold))

    print("Starting Scale")
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # should fit on the combined sets.
    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")

    clfs = [
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
        Lasso(alpha=0.000016681005372000593),
        #Ridge(),
        #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            #print("LEN: ", len(train), len(target))

            target = np.array(np.reshape(target, (-1, 1)))
            #train = np.array(np.reshape(train, (-1, 1)))
            weight = np.array(np.reshape(weight, (-1, 1)))

            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)))
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight=weight)
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            #print(predicted[:,0])
            print(predicted)
            dataset_blend_train[test_index, ExecutionIndex] = predicted  #[:,0] needed for Ridge

            #print(targetTest.shape)
            #print(predicted.shape)
            #print(weightTest.shape)

            print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds
            #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())))
            #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  #[:,0]

            foldCount = foldCount + 1
            #break

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #print(dataset_blend_test_set.mean(1))

        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../submission/Blend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index=False)

        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex])
        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../submission/Target_Blend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index=False)

        csv_io.write_delimited_file("../log/RunLogBlend.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles], filemode="a", delimiter=",")

        print("------------------------Average: " + str(avg))

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test

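# ---------------------------------------------------------------------------
# Illustrative sketch: score.normalized_weighted_gini above comes from a
# project-local module that is not included here.  One common formulation of
# the metric (an assumption, not necessarily identical to that module) sorts
# by prediction, builds the weighted Lorentz curve, takes the area between it
# and the diagonal, and normalizes by the score of a perfect ordering; any
# constant factors cancel in the ratio.

import numpy as np

def weighted_gini(actual, pred, weight):
    actual, pred, weight = map(np.asarray, (actual, pred, weight))
    order = np.argsort(-pred)                    # sort by prediction, descending
    a, w = actual[order], weight[order]
    cum_w = np.cumsum(w) / w.sum()               # x-axis: cumulative weight share
    lorentz = np.cumsum(a * w) / np.sum(a * w)   # y-axis: cumulative actual share
    return np.trapz(lorentz - cum_w, cum_w)      # area between curve and diagonal

def normalized_weighted_gini(actual, pred, weight):
    return weighted_gini(actual, pred, weight) / weighted_gini(actual, actual, weight)

# quick sanity check: a perfect ordering scores 1.0, a random one is near 0.0
rng = np.random.RandomState(0)
y = rng.rand(1000)
w = rng.rand(1000) + 0.1
print(normalized_weighted_gini(y, y, w))               # ~1.0
print(normalized_weighted_gini(y, rng.rand(1000), w))  # ~0.0
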
def main():

    trainBase = csv_io.read_data("PreProcessData/PreProcess2.csv", False)

    avg = 0
    NumFolds = 5  # should be odd for median
    predicted_list = []

    spanDistance = 12
    bootstrapLists = []

    NeighborsArray = [10]
    for Neighbors in NeighborsArray:
        predicted_list = []

        Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None)
        for train_index, test_index in Folds:

            trainBaseTemp = [trainBase[i + 1] for i in train_index]
            #trainBaseTemp = trainBase
            target = [x[0] for x in trainBaseTemp]
            train = [x[1:] for x in trainBaseTemp]

            testBaseTemp = [trainBase[i + 1] for i in test_index]
            #testBaseTemp = trainBase
            targetTest = [x[0] for x in testBaseTemp]
            trainTest = [x[1:] for x in testBaseTemp]

            test = csv_io.read_data("PreProcessData/PreTestData2.csv", False)
            test = [x[0:] for x in test]

            kn = neighbors.KNeighborsClassifier(n_neighbors=Neighbors, weights='distance', algorithm='brute', leaf_size=100, warn_on_equidistant=True, p=2)
            kn.fit(train, target)
            # NOTE: prob is indexed as prob[i][1] below, so predict_proba() was
            # probably intended here; kept as in the original.
            prob = kn.predict(trainTest)

            prob = SimpleScale(prob)  # scale output probabilities

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i][1]  # [1]
                if (probX > 0.999):
                    probX = 0.999
                if (probX < 0.001):
                    probX = 0.001
                #print i, probSum, probX, targetTest[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += int(targetTest[i]) * log(probX) + (1 - int(targetTest[i])) * log(1 - probX)

                if (math.fabs(probX - int(targetTest[i])) > 0.5):
                    totalOffByHalf = totalOffByHalf + 1
                if (int(targetTest[i]) == 1):
                    totalPositive = totalPositive + 1
                if (int(targetTest[i]) == 1 and probX < 0.5):
                    totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                if (probX > 0.5):
                    totalPositivePredictions = totalPositivePredictions + 1

            print "Total Off By > 0.5 ", totalOffByHalf
            print "Total Positive ", totalPositive
            print "Total Positive Off By Half ", totalPositiveOffByHalf
            print "Total Positive Predictions ", totalPositivePredictions
            print "Neighbors: ", Neighbors
            print -probSum / len(prob)
            avg += (-probSum / len(prob)) / NumFolds

            predicted_probs = kn.predict(test)  # was test
            prob = SimpleScale(prob)  # scale output probabilities
            predicted_list.append([x[1] for x in predicted_probs])

        avg_list = []
        med_list = []

        # For N folds, get the average/median for each prediction item in the test set.
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(predicted_list)):
                temp_list.append(predicted_list[q][p])
            avg_list.append(mean(temp_list))
            med_list.append(getMedian(temp_list))
            #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

        bootstrapLists.append(avg_list)

    # This would be used if we ran multiple runs with different training values.
    # Primitive stacking; better to save the data and do formal stacking.
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])
            finalList.append(meanSpan(temp_list, spanDistance))
            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/rf2_stack_avg.csv", avg_values)

    print "Average: ", avg

    var = raw_input("Enter to terminate.")

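# ---------------------------------------------------------------------------
# Illustrative sketch: the hand-rolled scoring loop above is (up to sign) the
# standard binary log-loss with the probabilities clipped to [0.001, 0.999].
# A hypothetical, self-contained equivalent using scikit-learn:

import numpy as np
from sklearn.metrics import log_loss

def clipped_log_loss(y_true, p_class1, eps=0.001):
    p = np.clip(np.asarray(p_class1, dtype=float), eps, 1.0 - eps)
    return log_loss(y_true, p)  # sklearn returns the negative mean log-likelihood

y_true = [0, 1, 1, 0, 1]
p_hat = [0.1, 0.8, 0.65, 0.3, 0.9]
print(clipped_log_loss(y_true, p_hat))
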
def PreProcess4(N_Features):

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv", skipFirstLine=False, split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv", skipFirstLine=False, split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv", False)
    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # This seems about optimal, but has not been tuned on the latest improvements.
    NumFeatures = N_Features
    # NOTE: going from 30 to 20 features on the KNN5 set has almost no effect; down to 15 is a significant loss.
    # For GBM at 6 and 400, 30 is 3.01 and 30 is 3.05.

    print "Scaling"
    term = 5000  # scaler has memory errors between 5000 and 10000
    #term = len(trainBase)
    targetPre = [x[0] for x in trainBase][0:term]
    trainPre = [x[1:] for x in trainBase][0:term]
    #testPre = [x[0:] for x in test][0:term]
    targetPre = target[0:term]
    #print trainPre[term - 1]

    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1, compute_importances=True)
    clf = GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166, min_samples_leaf=30)

    print "Training"
    clf.fit(trainScaled, targetPre)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], importances[DataIndex]
        DataClassListNew.append([DataClass[0], importances[DataIndex]])
    csv_io.write_delimited_file("PreProcessData/DataClassList_Importances_" + str(NumFeatures) + ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1), reverse=True)
    csv_io.write_delimited_file("PreProcessData/DataClassList_Importances_sorted_" + str(NumFeatures) + ".csv", DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]
        print "Importance threshold: ", threshold

        # Keep only the columns whose importance exceeds the threshold;
        # the target is re-attached as the first column of the training set.
        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

        csv_io.write_delimited_file("PreProcessData/training_PreProcess4_" + str(NumFeatures) + ".csv", trainNew, delimiter="\t")
        csv_io.write_delimited_file("PreProcessData/test_PreProcess4_" + str(NumFeatures) + ".csv", testNew, delimiter="\t")

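# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical names, synthetic data): PreProcess4 keeps
# the N most important columns according to a gradient-boosting model's
# feature_importances_.  The same selection with the current scikit-learn API
# (which spells the parameters loss='squared_error' and learning_rate, unlike
# the older 'ls'/learn_rate used above):

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

def select_top_features(X_train, y_train, X_test, n_features=30):
    gbr = GradientBoostingRegressor(loss="squared_error", learning_rate=0.05,
                                    subsample=0.5, max_depth=6, n_estimators=100,
                                    min_samples_leaf=30, random_state=166)
    gbr.fit(X_train, y_train)
    keep = np.argsort(gbr.feature_importances_)[::-1][:n_features]  # indices of the top-N columns
    return X_train[:, keep], X_test[:, keep], keep

rng = np.random.RandomState(0)
X, y = rng.rand(300, 50), rng.rand(300)
X_test = rng.rand(100, 50)
X_sel, X_test_sel, kept = select_top_features(X, y, X_test, n_features=10)
print(kept)
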
def Blend():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)

    target = [x[0] for x in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

    clfs = [
        LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
        LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
        LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None),
        LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None),
        LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5, fit_intercept=True, intercept_scaling=1, class_weight=None),
        LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None)
    ]

    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)
    dataset_blend_test_j = np.zeros((len(test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):
        #clf = LogisticRegression()
        clf.fit(dataset_blend_train, target)

        probs = clf.predict_proba(dataset_blend_test)[:, 1]
        submission = ["%f" % x for x in probs]

        now = datetime.datetime.now()
        csv_io.write_delimited_file_GUID("../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", submission)

        # attempt to score the training set to predict the score for the blend...
        probSum = 0.0
        trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
        for i in range(0, len(trainPrediction)):
            probX = trainPrediction[i]
            if (probX > 0.999):
                probX = 0.999
            if (probX < 0.001):
                probX = 0.001
            probSum += int(target[i]) * log(probX) + (1 - int(target[i])) * log(1 - probX)

        print "Train Score: ", (-probSum / len(trainPrediction))

        # store the numeric probabilities (not the formatted strings) for the final average
        dataset_blend_test_j[:, ExecutionIndex] = probs

    csv_io.write_delimited_file_GUID_numpy("../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1))

    var = raw_input("Enter to terminate.")

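# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical names, synthetic level-1 features): the
# Blend() step above is a "level-2" model -- a logistic regression fit on the
# out-of-fold predictions of the base models, then applied to their averaged
# test-set predictions.  In compact form:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

rng = np.random.RandomState(0)
level1_train = rng.rand(400, 6)   # out-of-fold predictions, one column per base model
y = (level1_train.mean(axis=1) + 0.1 * rng.randn(400) > 0.5).astype(int)
level1_test = rng.rand(100, 6)    # fold-averaged test predictions

blender = LogisticRegression(penalty="l2", C=1.0)
blender.fit(level1_train, y)

# Scoring the blender on its own training rows is optimistic, mirroring the
# "attempt to score the training set" comment in Blend() above.
train_score = log_loss(y, blender.predict_proba(level1_train)[:, 1])
test_probs = blender.predict_proba(level1_test)[:, 1]
print("train log-loss (optimistic):", train_score)
print(test_probs[:5])
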