# Train every model type on the "<expInfo>_train_tobe" feature file; when
# doTestFlag is switched on, also read the matching test file first.
doTestFlag = False
path = _basePath + expInfo + "_train_tobe.csv"
testPath = _basePath + expInfo + "_test_tobe.csv"

# 1. read data
dr = DataReader()
dr.readInCSV(path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame
if doTestFlag:
    # NOTE(review): this overwrites newX with the TEST frame while newY stays
    # the train answers, yet getAllModels(newX, newY) below uses both —
    # looks like a train/test mismatch when doTestFlag is on; confirm intent.
    dr.readInCSV(testPath, "test")
    newX = dr._testDataFrame
# newX = pd.DataFrame(newX[newX.columns[0]])
# print newX

# 3. get all best model from newX
fab = ModelFactory()
fab._gridSearchFlag = True
fab._n_iter_search = 30
fab._expInfo = expInfo
fab.getAllModels(newX, newY)

# 4. test all data, output 3 ans as features
# Saved-model paths from earlier runs, kept for reference:
# D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
# D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
# D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
# D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model
# modelPath = _basePath + "(K_NN)_(2016-02-03_20_32_22).model"
# tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
# tmpClf = loadModel(modelPath)
# log(tmpClf.predict_proba(newX))
tmpCurFeatureList.append(featureList[j]) log(tmpCurFeatureList) newX = pd.DataFrame() for tmpFeature in tmpCurFeatureList: path = _basePath + tmpFeature + "_train.csv" dr = DataReader() tmpX = dr.cvtPathListToDfList(path, "test") newX = pd.concat([newX, tmpX], axis=1) #log("feature len: " , len(newX)) # Get all best model from newX fab = ModelFactory() fab._setXgboostTheradToOne = True fab._gridSearchFlag = True fab._onlyTreeBasedModels = True fab._subFolderName = "one_hot_each_" + str(i) fab._n_iter_search = 30 fab._expInfo = expInfo # fab.getAllModels(newX, newY) fab.getRandomForestClf(newX, newY) # fab.getXgboostClf(newX, newY) log ( i , "/32 done..." )
# Same flow as the expInfo-keyed variant: train on "<expNo>_train_tobe",
# optionally loading the test file first.
path = _basePath + expNo + "_train_tobe.csv"
testPath = _basePath + expNo + "_test_tobe.csv"

# 1. read data
dr = DataReader()
dr.readInCSV(path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame
if doTestFlag:
    # NOTE(review): newX becomes the TEST frame while newY stays the train
    # answers, then both feed getAllModels — confirm intent.
    dr.readInCSV(testPath, "test")
    newX = dr._testDataFrame
# newX = pd.DataFrame(newX[newX.columns[0]])
# print newX

# 3. get all best model from newX
fab = ModelFactory()
fab._gridSearchFlag = True
fab._n_iter_search = 30
fab._expInfo = expInfo
fab.getAllModels(newX, newY)

# 4. test all data, output 3 ans as features
# Saved-model paths from earlier runs, kept for reference:
# D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
# D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
# D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
# D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model
# modelPath = _basePath + "(K_NN)_(2016-02-03_20_32_22).model"
# tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
# tmpClf = loadModel(modelPath)
# log(tmpClf.predict_proba(newX))
def exp(): expInfo = "location_only\\" _basePath = Config.FolderBasePath + expInfo doTestFlag = False path = _basePath + "train.csv" testPath = _basePath + "test10.csv" # 1. read data dr = DataReader() dr.readInCSV(path, "train") if doTestFlag == True: dr.readInCSV(testPath, "test") # 2. run models #print dr._trainDataFrame.as_matrix fab = ModelFactory() fab._gridSearchFlag = True fab._n_iter_search = 10 fab._expInfo = "location_only" X = dr._trainDataFrame Y = dr._ansDataFrame #fab.getRandomForestClf(X, Y) #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame) # log( "xgb start") # param = {'max_depth':10, 'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'} # num_round = 5 #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame) #testResult = gbm.predict_proba(dr._testDataFrame) #print testResult # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob') # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs = -1) # log( "xgboost Validation Precision: ", scores.mean() ) #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5,metrics={'error'}, seed = 0) #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame) #joblib.dump(gbTrain, xgbModelPath) #clf = joblib.load( xgbModelPath ) #clf.predict_proba(dr._testDataFrame) #xgb.save(gbm, xgbModelPath) #print xgbCv #print "xgb end" #gbm = joblib.load( xgbModelPath ) #finalClf = gbm if doTestFlag == True: print finalClf.predict_proba(dr._testDataFrame) # featureImportance =[] # for i in range(0,len(finalClf.feature_importances_)): # if i != len(dr._trainDataFrame.columns): # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1: # featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] ) # # print featureImportance # featureImportance.sort(lambda x, y: 
cmp(x[1], y[1]), reverse=True) # print featureImportance if doTestFlag == True: return finalClf.predict_proba(dr._testDataFrame)
# Train one grid-searched Xgboost on the merged one-hot training set, then
# log its predictions for the merged one-hot test set.
featureList = ["location", "event_type", "resource_type", "severity_type", "log_feature"]
ans1List = []
ans2List = []
# ansPath = _basePath + "014_ans_array.csv"
# drAns = DataReader()
# drAns.readInCSV(ansPath, "train")
# newY = drAns._ansDataFrame
tmpPath = _basePath + "train_merge_one_hot.csv"
dr = DataReader()
dr.readInCSV(tmpPath, "train")
newX = dr._trainDataFrame
newY = dr._ansDataFrame

fab = ModelFactory()
# fab._setXgboostTheradToOne = True
fab._gridSearchFlag = True
fab._singleModelMail = True
fab._subFolderName = "groupby_sum"
fab._n_iter_search = 1
fab._expInfo = expInfo
clf = fab.getXgboostClf(newX, newY)

# Score the test set.
tmpPath = _basePath + "test_merge_one_hot" + ".csv"
dr = DataReader()
dr.readInCSV(tmpPath, "test")
newX = dr._testDataFrame
newX = xgb.DMatrix(newX)
# NOTE(review): tmpOutPath is built but never written — predictions are only
# logged, not saved to CSV.
tmpOutPath = _basePath + expNo + "_" + "Xgboost_" + "groupby_sum" + "_ans.csv"
log(clf.predict(newX))
if __name__ == "__main__":
    # Experiment 012: fit a grid-searched random forest on the as-is feature
    # file and rank features by importance.
    # 1. read in data
    expNo = "012"
    expInfo = expNo + "_rf_chk_important"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_asis.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    # fab._subFolderName = "stacked"
    fab._n_iter_search = 250
    fab._expInfo = expInfo
    # fab.getAllModels(newX, newY)
    finalClf = fab.getRandomForestClf(newX, newY)

    # Pair each column name with its importance.  The `i != len(columns)`
    # guard is kept from the original; it never fires when the importance
    # vector has one entry per column.
    featureImportance = []
    for i in range(0, len(finalClf.feature_importances_)):
        if i != len(dr._trainDataFrame.columns):
            # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
            featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]])
    # log( featureImportance)
    # FIX: the old cmp-style sort (sort(lambda x, y: cmp(x[1], y[1]), ...))
    # is Python-2-only and slower; key= is equivalent (descending by
    # importance) and portable.
    featureImportance.sort(key=lambda pair: pair[1], reverse=True)
# (commented out) naive over-sampling of class-2 rows, kept for reference:
# for i in range(0,3):
#     tmpIdx = ans2List[random.randint(0, len(ans2List)-1)]
#     tmpAnsCol = pd.DataFrame()
#     tmpAnsCol[0] = 2
#     newX = newX.append(newX.iloc()[tmpIdx])
#     newX[newX.columns[0]][len(newX)-1] = 2
#     print i
#
# # print newX.iloc()[0]
# tmpOutPath = _basePath + "location_log_feature_over_sampling.csv"
# print len(newX)
# newX.to_csv(tmpOutPath, sep=',', encoding='utf-8')
# # print len(newX)

# Get all best model from newX: grid-search a single Xgboost run, mailing
# the result when done.
fab = ModelFactory()
fab._setXgboostTheradToOne = True
fab._gridSearchFlag = True
fab._singleModelMail = True
fab._subFolderName = "ismail4"
fab._n_iter_search = 10
fab._expInfo = expInfo
# fab.getAllModels(newX, newY)
# fab.getRandomForestClf(newX, newY)
fab.getXgboostClf(newX, newY)
# fab.getXgboostClf(newX, newY)
# log(i, "/32 done...")
dr2.readInCSV(tmpPath, "test") testX = dr2._testDataFrame ori_testX = testX #sampleRows = np.random.choice(testX.index, len(testX)*evalDataPercentage) sampleRows = [] for i in range(0, int(len(testX)/2)): sampleRows.append(i) test_fold_2 = testX.ix[sampleRows] test_fold_1 = testX.drop(sampleRows) clfList = ["xgboost", "rf","extra_tree",] fab = ModelFactory() fab._gridSearchFlag = True dfUpper = pd.DataFrame() dfTestUpper = pd.DataFrame() eachClfLoopTimes = 3 iter_search_times = 1 for tmpClfName in clfList: for i in range(0,eachClfLoopTimes): fab._subFolderName = tmpClfName fab._n_iter_search = iter_search_times fab._expInfo = expInfo if tmpClfName == "rf": clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1) elif tmpClfName == "knn":
if __name__ == '__main__':
    # Experiment 013: grid-search every model type on the "tobe" training
    # file, then alarm when finished.
    # 1. read in data
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._subFolderName = "binary"
    fab._n_iter_search = 50
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    musicAlarm()

    # Test all data
    modelList = [
        "Xgboost",
        "Random_Forest",
        "Extra_Trees",
        "K_NN",
        "Logistic_Regression",
    ]
    # featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
    # for tmpFeature in featureList:
# Experiment 013 (duplicate of the guarded variant): grid-search every model
# type on the "tobe" training file, then alarm when finished.
# 1. read in data
expNo = "013"
expInfo = expNo + "_data_exploration"
_basePath = Config.FolderBasePath + expInfo + Config.osSep
path = _basePath + expNo + "_train_tobe.csv"
testPath = _basePath + expNo + "_test_asis.csv"
dr = DataReader()
dr.readInCSV(path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame

# Get all best model from newX
fab = ModelFactory()
fab._gridSearchFlag = True
fab._subFolderName = "binary"
fab._n_iter_search = 50
fab._expInfo = expInfo
fab.getAllModels(newX, newY)
musicAlarm()

# Test all data
modelList = ["Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression"]
# featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
# for tmpFeature in featureList:
#     modelFolder = _basePath + "models" + Config.osSep
#     for tmpModel in modelList:
#         curModel = tmpModel
ori_testX = testX #sampleRows = np.random.choice(testX.index, len(testX)*evalDataPercentage) sampleRows = [] for i in range(0, int(len(testX) / 2)): sampleRows.append(i) test_fold_2 = testX.ix[sampleRows] test_fold_1 = testX.drop(sampleRows) clfList = [ "xgboost", "rf", "extra_tree", ] fab = ModelFactory() fab._gridSearchFlag = True dfUpper = pd.DataFrame() dfTestUpper = pd.DataFrame() eachClfLoopTimes = 3 iter_search_times = 1 for tmpClfName in clfList: for i in range(0, eachClfLoopTimes): fab._subFolderName = tmpClfName fab._n_iter_search = iter_search_times fab._expInfo = expInfo if tmpClfName == "rf": clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1) elif tmpClfName == "knn":
# # for i in range(0,3): # tmpIdx = ans2List[random.randint(0, len(ans2List)-1)] # tmpAnsCol = pd.DataFrame() # tmpAnsCol[0] = 2 # newX = newX.append(newX.iloc()[tmpIdx]) # newX[newX.columns[0]][len(newX)-1] = 2 # print i # # #print newX.iloc()[0] # tmpOutPath = _basePath + "location_log_feature_over_sampling.csv" # print len(newX) # newX.to_csv(tmpOutPath, sep=',', encoding='utf-8') # #print len(newX) # Get all best model from newX fab = ModelFactory() fab._setXgboostTheradToOne = True fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "ismail4" fab._n_iter_search = 10 fab._expInfo = expInfo # fab.getAllModels(newX, newY) #fab.getRandomForestClf(newX, newY) fab.getXgboostClf(newX, newY) # fab.getXgboostClf(newX, newY) # log ( i , "/32 done..." ) # musicAlarm() # Test all data modelList = [
# Grid-search a single random forest on the current newX/newY.
newY = dr._ansDataFrame
# (commented) manual one-hot expansion of the answer column, kept for reference:
# t = pd.get_dummies(newY)
# finalList = []
# for tmp in range(0,len(t)):
#     tmpList = []
#     for i in range(0, len(t.ix[tmp])):
#         tmpList.append(int(t.ix[tmp][i]))
#     finalList.append(tmpList)
# print finalList
# exit()
# print len(newX)

# Get all best model from newX
fab = ModelFactory()
fab._setXgboostTheradToOne = False
fab._gridSearchFlag = True
fab._subFolderName = "ismail3"
fab._n_iter_search = 1
fab._expInfo = expInfo
# clf = fab.getXgboostClf(newX, newY)
clf = fab.getRandomForestClf(newX, newY)
# print fab.getLogloss(clf,newX,newY)


def llfun(act, pred):
    # Per-sample log loss helper: clip predictions away from 0 so log() is
    # defined.  NOTE(review): this definition continues beyond this chunk.
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
def exp(): expInfo = "location_only\\" _basePath = Config.FolderBasePath + expInfo doTestFlag = False path = _basePath + "train.csv" testPath = _basePath + "test10.csv" # 1. read data dr = DataReader() dr.readInCSV( path, "train") if doTestFlag == True: dr.readInCSV(testPath , "test") # 2. run models #print dr._trainDataFrame.as_matrix fab = ModelFactory() fab._gridSearchFlag = True fab._n_iter_search = 10 fab._expInfo = "location_only" X = dr._trainDataFrame Y = dr._ansDataFrame #fab.getRandomForestClf(X, Y) #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame) # log( "xgb start") # param = {'max_depth':10, 'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'} # num_round = 5 #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame) #testResult = gbm.predict_proba(dr._testDataFrame) #print testResult # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob') # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs = -1) # log( "xgboost Validation Precision: ", scores.mean() ) #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5,metrics={'error'}, seed = 0) #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame) #joblib.dump(gbTrain, xgbModelPath) #clf = joblib.load( xgbModelPath ) #clf.predict_proba(dr._testDataFrame) #xgb.save(gbm, xgbModelPath) #print xgbCv #print "xgb end" #gbm = joblib.load( xgbModelPath ) #finalClf = gbm if doTestFlag == True: print finalClf.predict_proba(dr._testDataFrame) # featureImportance =[] # for i in range(0,len(finalClf.feature_importances_)): # if i != len(dr._trainDataFrame.columns): # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1: # featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] ) # # print featureImportance # featureImportance.sort(lambda x, 
y: cmp(x[1], y[1]), reverse=True) # print featureImportance if doTestFlag == True: return finalClf.predict_proba(dr._testDataFrame)