# Common imports assumed by every snippet below. `score` (normalized weighted
# Gini) and `csv_io` (thin CSV helpers) are project-local modules; assumed
# sketches of both are given further down. cross_validation.KFold with the
# n_folds/indices arguments is the scikit-learn 0.14-era API this code targets.
import gc
import os
import datetime

import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn.linear_model import Lasso, ElasticNet, LinearRegression, Ridge, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

import csv_io
import score


def run_stack(SEED):
    # Feature selection: repeatedly fit a Lasso under 5-fold CV, average the
    # coefficients across folds, and drop the features whose coefficients sit
    # closest to zero, keeping the best-scoring column set seen so far.
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns.values.tolist()
    columnsHighScore = trainBase.columns.values.tolist()
    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    #test = np.nan_to_num(np.array(test))
    gc.collect()

    avg = 0
    avgLast = -1
    NumFolds = 5

    clf = Lasso(alpha=0.00010)  # found with tune_lasso.py

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)
    gc.collect()

    featuresRemaining = []
    avgScore = []

    while True:
        print(clf)
        avg = 0
        coef_dataset = np.zeros((len(columns), NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            target = np.array(np.reshape(target, (-1, 1)))
            weight = np.array(np.reshape(weight, (-1, 1)))
            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight=weight)
            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            print(str(score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds

            coef_dataset[:, foldCount] = clf.coef_
            foldCount = foldCount + 1

        coefs = coef_dataset.mean(1)
        # Removal must start with the coefficients closest to zero, so sort by magnitude.
        sorted_coefs = sorted(map(abs, coefs))
        print(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        if len(sorted_coefs) <= 5:  # sorted_coefs[5] below needs at least six entries.
            break
        threshold = sorted_coefs[5]

        print(str(len(columns)))
        print(trainBase.shape)

        toDrop = []
        # The var11 (weight) and id columns must never be dropped.
        # Iterate in reverse: deletions shift later columns to lower indices.
        for index in range(len(coefs) - 1, -1, -1):
            if abs(coefs[index]) <= threshold and columns[index] != "var11" and columns[index] != "id":
                print("Drop: " + str(index) + " " + columns[index] + " " + str(coefs[index]))
                toDrop.append(index)
                del columns[index]  # delete by position; list.remove() would hit the first matching name.

        print("start drop")
        trainBase = np.delete(trainBase, toDrop, axis=1)
        print("End drop")

        if avg > avgLast:
            print("Saving Copy " + str(avgLast) + " " + str(avg))
            avgLast = avg
            columnsHighScore = columns.copy()

        print("Threshold: " + str(threshold))
        print("------------------------Average: " + str(avg))
        print(columnsHighScore)
        print(str(len(columns)))
        print(trainBase.shape)

        featuresRemaining.append(len(columns))
        avgScore.append(avg)
        gc.collect()

    # Write the best-scoring column subset out for the later stack/blend stages.
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBase = trainBase.loc[:, columnsHighScore]
    trainBase.to_csv("../models/" + str(clf)[:5] + "_train.csv", index=False)
    gc.collect()

    test = pd.read_csv('../preprocessdata/pre_departition_test.csv')
    test = test.loc[:, columnsHighScore]
    test.to_csv("../models/" + str(clf)[:5] + "_test.csv", index=False)

    print(columnsHighScore)
    print(featuresRemaining)
    print(avgScore)
def run_stack(SEED): model = "Lasso" lossThreshold = 0.38 trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv') trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv') trainBaseWeight = trainBaseOrig['var11'] testOrig = pd.read_csv('../models/' + model + '_test.csv') targetBase = np.nan_to_num(np.array(trainBaseTarget)) trainBaseID = trainBaseOrig['id'] testID = testOrig['id'] avg = 0 NumFolds = 5 stackFiles = [] for filename in os.listdir("../predictions"): parts = filename.split("_") if (filename[0:5] == "Stack" and float(parts[2]) > lossThreshold): stackFiles.append(filename) trainBase = np.zeros((len(trainBaseOrig), len(stackFiles))) test = np.zeros((len(testOrig), len(stackFiles))) print("Loading Data") for fileNum, file in enumerate(stackFiles): print(file) trn = csv_io.read_data( "../predictions/Target_" + file, split=",", skipFirstLine=True) # skip first because of header. for row, datum in enumerate(trn): trainBase[row, fileNum] = datum[1] # -1 because we skil tst = csv_io.read_data( "../predictions/" + file, split=",", skipFirstLine=True) # skip first because of header. for row, datum in enumerate(tst): test[row, fileNum] = datum[1] np.savetxt('temp/dataset_blend_train.txt', trainBase) np.savetxt('temp/dataset_blend_test.txt', test) print("Num file processed: " + " " + str(len(stackFiles)) + " " + "Threshold: " + str(lossThreshold)) print("Starting Scale") allVals = np.vstack((trainBase, test)) scl = StandardScaler(copy=True, with_mean=True, with_std=True) scl.fit(allVals) # should fit on the combined sets. trainBase = scl.transform(trainBase) test = scl.transform(test) print("Starting Blend") clfs = [ #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1), Lasso(alpha=0.000016681005372000593), #Ridge(), #LinearRegression(fit_intercept=True, normalize=False, copy_X=True) ] print("Data size: " + str(len(trainBase)) + " " + str(len(test))) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) dataset_blend_test = np.zeros((len(test), len(clfs))) print("Begin Training") lenTrainBase = len(trainBase) lenTest = len(test) gc.collect() for ExecutionIndex, clf in enumerate(clfs): print(clf) avg = 0 dataset_blend_test_set = np.zeros((lenTest, NumFolds)) foldCount = 0 Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True) for train_index, test_index in Folds: print() print("Iteration: " + str(foldCount)) now = datetime.datetime.now() print(now.strftime("%Y/%m/%d %H:%M:%S")) target = [targetBase[i] for i in train_index] train = [trainBase[i] for i in train_index] weight = [trainBaseWeight[i] for i in train_index] targetTest = [targetBase[i] for i in test_index] trainTest = [trainBase[i] for i in test_index] weightTest = [trainBaseWeight[i] for i in test_index] #print "LEN: ", len(train), len(target) target = np.array(np.reshape(target, (-1, 1))) #train = np.array(np.reshape(train, (-1, 1)) ) weight = np.array(np.reshape(weight, (-1, 1))) targetTest = np.array(np.reshape(targetTest, (-1, 1))) #trainTest = np.array(np.reshape(trainTest, (-1, 1)) ) weightTest = np.array(np.reshape(weightTest, (-1, 1))) #clf.fit(train, target, sample_weight = weight clf.fit(train, target) predicted = clf.predict(trainTest) #print(predicted[:,0]) print(predicted) dataset_blend_train[ test_index, ExecutionIndex] = predicted #[:,0] #needed for Ridge #print(targetTest.shape) #print(prpredictedob.shape) #print(weightTest.shape) print( str( 
score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel()))) avg += score.normalized_weighted_gini( targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel()))) #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds predicted = clf.predict(test) dataset_blend_test_set[:, foldCount] = predicted #[:,0] foldCount = foldCount + 1 #break dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1) now = datetime.datetime.now() #print dataset_blend_test_set.mean(1) #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target']) submission['target'] = dataset_blend_test[:, ExecutionIndex] submission['id'] = testID submission.to_csv("../submission/Blend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index=False) #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target']) submission['target'] = dataset_blend_train[:, ExecutionIndex] submission['id'] = trainBaseID submission.to_csv("../submission/Target_Blend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index=False) csv_io.write_delimited_file("../log/RunLogBlend.csv", [ now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles ], filemode="a", delimiter=",") print("------------------------Average: " + str(avg)) #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train) return dataset_blend_train, dataset_blend_test
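# `csv_io` is likewise a project-local helper module whose source is not
# shown. A minimal assumed version consistent with the call sites above
# (read_data returning rows of floats, write_delimited_file appending one
# log row) might look like this; the original implementation may differ.
import csv

def read_data(filename, split=",", skipFirstLine=False):
    # Read a delimited text file into a list of float rows.
    with open(filename) as f:
        reader = csv.reader(f, delimiter=split)
        if skipFirstLine:
            next(reader)  # drop the header row
        return [[float(value) for value in row] for row in reader]


def write_delimited_file(filename, row, filemode="w", delimiter=","):
    # Write (or append, with filemode="a") a single delimited row, e.g. a run-log entry.
    with open(filename, filemode, newline="") as f:
        csv.writer(f, delimiter=delimiter).writerow([str(v) for v in row])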
def run_stack(SEED): model = "Lasso" trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv') trainBase = pd.read_csv('../models/' + model + '_train.csv') trainBaseWeight = trainBase['var11'] test = pd.read_csv('../models/' + model + '_test.csv') #trainBase = shuffle(trainBase, random_state = SEED) print(trainBase.columns) trainBaseID = trainBase['id'] testID = test['id'] trainBase = np.nan_to_num(np.array(trainBase)) targetBase = np.nan_to_num(np.array(trainBaseTarget)) test = np.nan_to_num(np.array(test)) avg = 0 NumFolds = 5 #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1), #Ridge() clfs = [ LinearRegression(fit_intercept=True, normalize=False, copy_X=True) #BaggingRegressor(base_estimator=Ridge(), n_estimators=500, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) #AdaBoostRegressor(base_estimator=Ridge(), n_estimators=50, learning_rate=1.0, loss='linear', random_state=None) #Lasso(alpha=0.0000329034456231), #Ridge(), #RandomForestRegressor(n_estimators=3000, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=None), #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1), #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166, min_samples_leaf=1), #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=300, random_state=166, min_samples_leaf=1), #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=1000, random_state=166, min_samples_leaf=1), #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=3000, random_state=166, min_samples_leaf=1), ] print ("Data size: " + str(len(trainBase)) + " " + str(len(test))) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) dataset_blend_test = np.zeros((len(test), len(clfs))) print("Begin Training") lenTrainBase = len(trainBase) lenTest = len(test) gc.collect() for ExecutionIndex, clf in enumerate(clfs): print(clf) avg = 0 dataset_blend_test_set = np.zeros((lenTest, NumFolds)) foldCount = 0 Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True) for train_index, test_index in Folds: print() print ("Iteration: " + str(foldCount)) now = datetime.datetime.now() print(now.strftime("%Y/%m/%d %H:%M:%S")) target = [targetBase[i] for i in train_index] train = [trainBase[i] for i in train_index] weight = [trainBaseWeight[i] for i in train_index] targetTest = [targetBase[i] for i in test_index] trainTest = [trainBase[i] for i in test_index] weightTest = [trainBaseWeight[i] for i in test_index] #print "LEN: ", len(train), len(target) target = np.array(np.reshape(target, (-1, 1)) ) #train = np.array(np.reshape(train, (-1, 1)) ) weight = np.array(np.reshape(weight, (-1, 1))) targetTest = np.array(np.reshape(targetTest, (-1, 1)) ) #trainTest = np.array(np.reshape(trainTest, (-1, 1)) ) weightTest = np.array(np.reshape(weightTest, (-1, 1))) #clf.fit(train, target, sample_weight = weight clf.fit(train, target) predicted = clf.predict(trainTest) #print(predicted[:,0]) print(test_index) 
dataset_blend_train[test_index, ExecutionIndex] = predicted[:,0] #needed for Ridge #print(targetTest.shape) #print(prpredictedob.shape) #print(weightTest.shape) print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel()))) avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds predicted[predicted[:,0] < 0.0] = 0.0 print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel()))) avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds predicted = clf.predict(test) dataset_blend_test_set[:, foldCount] = predicted[:,0] foldCount = foldCount + 1 dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1) now = datetime.datetime.now() #print dataset_blend_test_set.mean(1) #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target']) submission['target'] = dataset_blend_test[:,ExecutionIndex] submission['id'] = testID submission.to_csv("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index = False) #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target']) submission['target'] = dataset_blend_train[:,ExecutionIndex] submission['id'] = trainBaseID submission.to_csv("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index = False) csv_io.write_delimited_file("../log/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", "", "", ""], filemode="a",delimiter=",") print ("------------------------Average: " + str(avg)) #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train) return dataset_blend_train, dataset_blend_test
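# The folds above rely on the scikit-learn 0.14-era
# cross_validation.KFold(n, n_folds=..., indices=True) API, which no longer
# exists. Under current scikit-learn the same out-of-fold stacking pattern
# looks roughly like this sketch (function name assumed, not part of the
# original code):
def out_of_fold_predictions(X, y, n_splits=5):
    from sklearn.model_selection import KFold
    oof = np.zeros(len(X))
    for train_index, test_index in KFold(n_splits=n_splits).split(X):
        clf = LinearRegression()
        clf.fit(X[train_index], y[train_index])
        oof[test_index] = clf.predict(X[test_index])
    return oof  # one prediction per training row, each from a model that never saw that row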
def run_stack(SEED):
    # Tune the ElasticNet alpha against the weighted-Gini score. Only the
    # first fold is scored (see the break below), so avg is one fold's score
    # divided by NumFolds: comparable across alphas, not an absolute CV score.
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    #trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns
    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    gc.collect()

    avg = 0
    bestAvg = avg
    bestAlpha = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)
    gc.collect()

    # best alpha is 0.00069956421567126271
    for a in np.logspace(-8, -.5, 30):  # best values seem to be slightly greater than 0.
        #for r in range(1, 10):  # r = 0.1 is good
        #for t in np.logspace(2, 6, 10):  # t = 0.0001 is good
        clf = ElasticNet(alpha=a, l1_ratio=1 / 10, fit_intercept=True,
                         normalize=False, precompute='auto', max_iter=10000,
                         copy_X=True, tol=1 / 10000, warm_start=False, positive=False)
        print(clf)
        avg = 0
        coef_dataset = np.zeros((len(columns), NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            target = np.array(np.reshape(target, (-1, 1)))
            weight = np.array(np.reshape(weight, (-1, 1)))
            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight=weight)
            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            print(str(score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds

            coef_dataset[:, foldCount] = clf.coef_
            foldCount = foldCount + 1
            break  # first fold only, to keep the sweep fast.

        coefs = coef_dataset.mean(1)
        sorted_coefs = sorted(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        coefsAboveZero = [i for i in coefs if i > 0.0]
        print(str(len(coefsAboveZero)))

        print("------------------------Average: " + str(avg))
        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a  # the original left this commented out, which made bestAlpha always print as 0.

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
def run_stack(SEED):
    # Tune the Lasso alpha against the weighted-Gini score over a log-spaced
    # grid, tracking the best-scoring alpha across the full 5-fold CV.
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    #trainBase = pd.read_csv('../preprocessdata/pre_shuffled_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns
    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    gc.collect()

    avg = 0
    bestAvg = avg
    bestAlpha = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)
    gc.collect()

    # best alpha is 0.00040
    for a in np.logspace(-6, -.5, 30):  # best values seem to be slightly greater than 0.
        clf = Lasso(alpha=a)
        print(clf)
        avg = 0
        coef_dataset = np.zeros((len(columns), NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            target = np.array(np.reshape(target, (-1, 1)))
            weight = np.array(np.reshape(weight, (-1, 1)))
            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight=weight)
            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            print(str(score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds

            coef_dataset[:, foldCount] = clf.coef_
            foldCount = foldCount + 1
            #break  # uncomment to score only the first fold, as in the ElasticNet tuner.

        coefs = coef_dataset.mean(1)
        print(coefs)
        sorted_coefs = sorted(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        coefsAboveZero = [i for i in coefs if i > 0.0]
        print(str(len(coefsAboveZero)))

        print("------------------------Average: " + str(avg))
        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
def run_stack(SEED): model = "Lasso" lossThreshold = 0.38 trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv') trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv') trainBaseWeight = trainBaseOrig['var11'] testOrig = pd.read_csv('../models/' + model + '_test.csv') targetBase = np.nan_to_num(np.array(trainBaseTarget)) trainBaseID = trainBaseOrig['id'] testID = testOrig['id'] avg = 0 NumFolds = 5 stackFiles = [] for filename in os.listdir("../predictions"): parts = filename.split("_") if ( filename[0:5] == "Stack" and float(parts[2]) > lossThreshold): stackFiles.append(filename) trainBase = np.zeros((len(trainBaseOrig), len(stackFiles))) test = np.zeros((len(testOrig), len(stackFiles))) print("Loading Data") for fileNum, file in enumerate(stackFiles): print(file) trn = csv_io.read_data("../predictions/Target_" + file, split="," ,skipFirstLine = True) # skip first because of header. for row, datum in enumerate(trn): trainBase[row, fileNum] = datum[1] # -1 because we skil tst = csv_io.read_data("../predictions/" + file, split="," ,skipFirstLine = True) # skip first because of header. for row, datum in enumerate(tst): test[row, fileNum] = datum[1] np.savetxt('temp/dataset_blend_train.txt', trainBase) np.savetxt('temp/dataset_blend_test.txt', test) print("Num file processed: " + " " + str(len(stackFiles)) + " " + "Threshold: " + str(lossThreshold)) print("Starting Scale") allVals = np.vstack((trainBase,test)) scl = StandardScaler(copy=True, with_mean=True, with_std=True) scl.fit(allVals) # should fit on the combined sets. trainBase= scl.transform(trainBase) test = scl.transform(test) print("Starting Blend") clfs = [ #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1), Lasso(alpha=0.000016681005372000593), #Ridge(), #LinearRegression(fit_intercept=True, normalize=False, copy_X=True) ] print ("Data size: " + str(len(trainBase)) + " " + str(len(test))) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) dataset_blend_test = np.zeros((len(test), len(clfs))) print("Begin Training") lenTrainBase = len(trainBase) lenTest = len(test) gc.collect() for ExecutionIndex, clf in enumerate(clfs): print(clf) avg = 0 dataset_blend_test_set = np.zeros((lenTest, NumFolds)) foldCount = 0 Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True) for train_index, test_index in Folds: print() print ("Iteration: " + str(foldCount)) now = datetime.datetime.now() print(now.strftime("%Y/%m/%d %H:%M:%S")) target = [targetBase[i] for i in train_index] train = [trainBase[i] for i in train_index] weight = [trainBaseWeight[i] for i in train_index] targetTest = [targetBase[i] for i in test_index] trainTest = [trainBase[i] for i in test_index] weightTest = [trainBaseWeight[i] for i in test_index] #print "LEN: ", len(train), len(target) target = np.array(np.reshape(target, (-1, 1)) ) #train = np.array(np.reshape(train, (-1, 1)) ) weight = np.array(np.reshape(weight, (-1, 1))) targetTest = np.array(np.reshape(targetTest, (-1, 1)) ) #trainTest = np.array(np.reshape(trainTest, (-1, 1)) ) weightTest = np.array(np.reshape(weightTest, (-1, 1))) #clf.fit(train, target, sample_weight = weight clf.fit(train, target) predicted = clf.predict(trainTest) #print(predicted[:,0]) print(predicted) dataset_blend_train[test_index, ExecutionIndex] = predicted#[:,0] #needed for Ridge #print(targetTest.shape) #print(prpredictedob.shape) #print(weightTest.shape) 
print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel()))) avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel()))) #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds predicted = clf.predict(test) dataset_blend_test_set[:, foldCount] = predicted#[:,0] foldCount = foldCount + 1 #break dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1) now = datetime.datetime.now() #print dataset_blend_test_set.mean(1) #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target']) submission['target'] = dataset_blend_test[:,ExecutionIndex] submission['id'] = testID submission.to_csv("../submission/Blend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index = False) #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target']) submission['target'] = dataset_blend_train[:,ExecutionIndex] submission['id'] = trainBaseID submission.to_csv("../submission/Target_Blend_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", index = False) csv_io.write_delimited_file("../log/RunLogBlend.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", "Blend", "Stacks: ", stackFiles], filemode="a",delimiter=",") print ("------------------------Average: " + str(avg)) #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train) return dataset_blend_train, dataset_blend_test
def run_pre_single_feature(SEED):
    # Score each feature on its own: 5-fold CV of a regressor fit to a single
    # column, recording the weighted-Gini average per column per estimator.
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns
    print(trainBase.columns)
    trainBaseID = trainBase['id']

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))

    avg = 0
    NumFolds = 5

    print("Begin Training")
    lenTrainBase = len(trainBase)
    gc.collect()

    columnScores = {}

    clfs = [
        # Alternatives tried: Ridge(), RandomForestRegressor(n_estimators=30, ...),
        # GradientBoostingRegressor(...), AdaBoostRegressor(...), BayesianRidge(...).
        SVR(kernel='rbf', degree=3, gamma=0.0, coef0=0.0, tol=0.001, C=1.0,
            epsilon=0.1, shrinking=True, probability=False, cache_size=200,
            verbose=False, max_iter=-1, random_state=None),  # gamma=0.0 meant 1/n_features in this scikit-learn era; floating point over/under flow seen.
        SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001,
                     l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False,
                     verbose=0, epsilon=0.1, random_state=None,
                     learning_rate='invscaling', eta0=0.01, power_t=0.25,
                     warm_start=False, rho=None),  # rho is the deprecated alias of l1_ratio.
    ]

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        columnCount = 0  # reset per estimator.
        for column in range(0, trainBase.shape[1]):
            avg = 0
            foldCount = 0
            Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
            for train_index, test_index in Folds:
                target = [targetBase[i] for i in train_index]
                train = [trainBase[i, column] for i in train_index]
                weight = [trainBaseWeight[i] for i in train_index]

                targetTest = [targetBase[i] for i in test_index]
                trainTest = [trainBase[i, column] for i in test_index]
                weightTest = [trainBaseWeight[i] for i in test_index]

                target = np.array(np.reshape(target, (-1, 1)))
                train = np.array(np.reshape(train, (-1, 1)))
                weight = np.array(np.reshape(weight, (-1, 1)))
                targetTest = np.array(np.reshape(targetTest, (-1, 1)))
                trainTest = np.array(np.reshape(trainTest, (-1, 1)))
                weightTest = np.array(np.reshape(weightTest, (-1, 1)))

                clf.fit(train, target)
                prob = clf.predict(trainTest)

                print(str(score.normalized_weighted_gini(
                    targetTest.ravel(), prob.ravel(), weightTest.ravel())))
                avg += score.normalized_weighted_gini(
                    targetTest.ravel(), prob.ravel(), weightTest.ravel()) / NumFolds
                foldCount = foldCount + 1

            print(str(columns[column]) + " Average Score: " + str(avg))
            columnScores[str(columns[column])] = avg

            columnCount = columnCount + 1
            if columnCount > 2:
                break  # debug limit: scores only the first three columns; remove to score them all.

        submission = pd.Series(columnScores)
        submission.to_csv("../featureanalysis/single_feature_" + str(clf)[:5] + ".csv")
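# To rank the saved per-feature results afterwards (filename assumed to match
# the write above; str(clf)[:5] yields e.g. "SVR(k" for the SVR estimator):
feature_scores = pd.read_csv("../featureanalysis/single_feature_SVR(k.csv",
                             header=None, index_col=0).squeeze()
print(feature_scores.sort_values(ascending=False).head(20))  # strongest single features first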
def run_stack(SEED):
    # Tune the blend-stage Lasso alpha over the stacked prediction matrix,
    # using the same Stack-file loading and scaling as the blend step.
    model = "Lasso"
    lossThreshold = 0.3

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBaseOrig = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBaseOrig['var11']
    testOrig = pd.read_csv('../models/' + model + '_test.csv')

    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    trainBaseID = trainBaseOrig['id']
    testID = testOrig['id']

    avg = 0
    NumFolds = 5
    bestAvg = avg
    bestAlpha = 0

    stackFiles = []
    for filename in os.listdir("../predictions"):
        parts = filename.split("_")
        # Stack file names embed the CV score as their third component.
        if filename[0:5] == "Stack" and float(parts[2]) > lossThreshold:
            stackFiles.append(filename)

    trainBase = np.zeros((len(trainBaseOrig), len(stackFiles)))
    test = np.zeros((len(testOrig), len(stackFiles)))

    print("Loading Data")
    for fileNum, file in enumerate(stackFiles):
        print(file)
        trn = csv_io.read_data("../predictions/Target_" + file, split=",", skipFirstLine=True)  # skip the header row.
        for row, datum in enumerate(trn):
            trainBase[row, fileNum] = datum[1]  # column 1 holds the prediction.
        tst = csv_io.read_data("../predictions/" + file, split=",", skipFirstLine=True)  # skip the header row.
        for row, datum in enumerate(tst):
            test[row, fileNum] = datum[1]

    np.savetxt('temp/dataset_blend_train.txt', trainBase)
    np.savetxt('temp/dataset_blend_test.txt', test)
    print("Num files processed: " + str(len(stackFiles)) + " Threshold: " + str(lossThreshold))

    print("Starting Scale")
    allVals = np.vstack((trainBase, test))
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    scl.fit(allVals)  # fit on the combined train and test sets.
    trainBase = scl.transform(trainBase)
    test = scl.transform(test)

    print("Starting Blend")
    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)
    gc.collect()

    for a in np.logspace(-6, -.5, 10):  # best values seem to be slightly greater than 0.
        clf = Lasso(alpha=a)
        print(clf)
        avg = 0
        coef_dataset = np.zeros((len(stackFiles), NumFolds))

        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            target = np.array(np.reshape(target, (-1, 1)))
            weight = np.array(np.reshape(weight, (-1, 1)))
            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            clf.fit(train, target)
            predicted = clf.predict(trainTest)

            print(str(score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds

            coef_dataset[:, foldCount] = clf.coef_
            foldCount = foldCount + 1

        coefs = coef_dataset.mean(1)
        print(coefs)
        sorted_coefs = sorted(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        coefsAboveZero = [i for i in coefs if i > 0.0]
        print(str(len(coefsAboveZero)))

        print("------------------------Average: " + str(avg))
        if avg > bestAvg:
            bestAvg = avg
            bestAlpha = a

    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
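# Current scikit-learn can search the alpha path directly with LassoCV. Note
# that LassoCV picks alpha by cross-validated mean-squared error, not by the
# weighted Gini used above, so the chosen alpha may differ (sketch only;
# trainBase/targetBase as prepared in the function above):
def tune_alpha_with_lassocv(trainBase, targetBase):
    from sklearn.linear_model import LassoCV
    clf = LassoCV(alphas=np.logspace(-6, -0.5, 10), cv=5)
    clf.fit(trainBase, targetBase.ravel())
    return clf.alpha_  # alpha with the lowest CV mean-squared error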
def run_stack(SEED):
    # Grid-search SVR C and gamma (powers of ten, ordered middle-out so the
    # most promising exponents are tried first) against the weighted-Gini
    # score on the first fold only.
    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/Lasso_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns
    print(trainBase.columns)

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    gc.collect()

    avg = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)
    gc.collect()

    CC = [6, 5, 7, 4, 8, 3, 9, 2, 10, 1]
    GG = [-6, -7, -5, -8, -4, -9, -3, -10, -2, -1]
    for c in CC:
        for g in GG:
            clf = SVR(kernel='rbf', degree=3, gamma=10**g, coef0=0.0,
                      tol=0.001, C=10**c, epsilon=0.1, shrinking=True,
                      probability=False, cache_size=200, verbose=False,
                      max_iter=-1, random_state=None)
            print(clf)
            print(str(c) + " " + str(g))
            avg = 0

            foldCount = 0
            Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
            for train_index, test_index in Folds:
                print()
                print("Iteration: " + str(foldCount))
                now = datetime.datetime.now()
                print(now.strftime("%Y/%m/%d %H:%M:%S"))

                target = [targetBase[i] for i in train_index]
                train = [trainBase[i] for i in train_index]
                weight = [trainBaseWeight[i] for i in train_index]

                targetTest = [targetBase[i] for i in test_index]
                trainTest = [trainBase[i] for i in test_index]
                weightTest = [trainBaseWeight[i] for i in test_index]

                target = np.array(np.reshape(target, (-1, 1)))
                weight = np.array(np.reshape(weight, (-1, 1)))
                targetTest = np.array(np.reshape(targetTest, (-1, 1)))
                weightTest = np.array(np.reshape(weightTest, (-1, 1)))

                #clf.fit(train, target, sample_weight=weight)
                clf.fit(train, target.ravel())
                predicted = clf.predict(trainTest)

                print(str(score.normalized_weighted_gini(
                    targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
                avg += score.normalized_weighted_gini(
                    targetTest.ravel(), predicted.ravel(), weightTest.ravel()) / NumFolds

                # clf.coef_ is only defined for a linear kernel; with the RBF
                # kernel the original coef_dataset bookkeeping would raise an
                # AttributeError, so it is dropped here.
                foldCount = foldCount + 1
                break  # first fold only, to keep the grid search tractable.

            print("------------------------Average: " + str(avg))
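# The same sweep can be expressed with GridSearchCV. By default it optimizes
# the estimator's own score (R^2 for SVR) rather than the weighted Gini used
# above, so the selected C/gamma pair will not necessarily match (sketch,
# function name assumed):
def tune_svr_with_gridsearch(trainBase, targetBase):
    from sklearn.model_selection import GridSearchCV
    param_grid = {"C": [10.0**c for c in range(1, 11)],
                  "gamma": [10.0**g for g in range(-10, 0)]}
    search = GridSearchCV(SVR(kernel="rbf"), param_grid, cv=5, n_jobs=-1)
    search.fit(trainBase, targetBase.ravel())
    return search.best_params_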