# Build the feature matrix column-by-column: read each feature's *_train.csv
# and concatenate it horizontally onto newX.
# NOTE(review): `i`, `tmpCurFeatureList`, `_basePath`, `expInfo` and `newY`
# come from enclosing code outside this chunk.
newX = pd.DataFrame()
for tmpFeature in tmpCurFeatureList:
    path = _basePath + tmpFeature + "_train.csv"
    dr = DataReader()
    # NOTE(review): the path points at a *_train.csv file but the mode flag
    # passed is "test" -- confirm cvtPathListToDfList's second argument.
    tmpX = dr.cvtPathListToDfList(path, "test")
    newX = pd.concat([newX, tmpX], axis=1)
#log("feature len: " , len(newX))

# Get all best model from newX
fab = ModelFactory()
fab._setXgboostTheradToOne = True   # presumably pins xgboost to one thread (attr name typo: "Therad")
fab._gridSearchFlag = True          # enable randomized parameter search
fab._onlyTreeBasedModels = True
fab._subFolderName = "one_hot_each_" + str(i)
fab._n_iter_search = 30             # parameter-search iterations
fab._expInfo = expInfo
# fab.getAllModels(newX, newY)
fab.getRandomForestClf(newX, newY)
# fab.getXgboostClf(newX, newY)
log ( i , "/32 done..." )
musicAlarm()                        # audible notification for this iteration

# Test all data
modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
# newX = newX.append(newX.iloc()[tmpIdx]) # newX[newX.columns[0]][len(newX)-1] = 2 # print i # # #print newX.iloc()[0] # tmpOutPath = _basePath + "location_log_feature_over_sampling.csv" # print len(newX) # newX.to_csv(tmpOutPath, sep=',', encoding='utf-8') # #print len(newX) # Get all best model from newX fab = ModelFactory() fab._setXgboostTheradToOne = True fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "ismail4" fab._n_iter_search = 10 fab._expInfo = expInfo # fab.getAllModels(newX, newY) #fab.getRandomForestClf(newX, newY) fab.getXgboostClf(newX, newY) # fab.getXgboostClf(newX, newY) # log ( i , "/32 done..." ) # musicAlarm() # Test all data
# drAns = DataReader()
# drAns.readInCSV(ansPath, "train")
# newY = drAns._ansDataFrame

# 1. Load the merged one-hot training set and its labels.
#    NOTE(review): `_basePath`, `expNo` and `expInfo` are defined outside
#    this chunk.
tmpPath = _basePath + "train_merge_one_hot.csv"
dr = DataReader()
dr.readInCSV(tmpPath, "train")
newX = dr._trainDataFrame
newY = dr._ansDataFrame

# 2. Grid-search an xgboost classifier on the training data.
fab = ModelFactory()
#fab._setXgboostTheradToOne = True
fab._gridSearchFlag = True      # enable randomized parameter search
fab._singleModelMail = True     # presumably e-mails this single model's result
fab._subFolderName = "groupby_sum"
fab._n_iter_search = 1          # parameter-search iterations
fab._expInfo = expInfo
clf = fab.getXgboostClf(newX, newY)

# 3. Load the test set, predict, and write the answers to CSV.
tmpPath = _basePath + "test_merge_one_hot" + ".csv"
dr = DataReader()
dr.readInCSV(tmpPath, "test")
newX = dr._testDataFrame
newX = xgb.DMatrix(newX)        # xgboost Booster.predict requires a DMatrix
tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans.csv"
# FIX: predict once and reuse the result -- the original called
# clf.predict(newX) twice (once for logging, once for the output frame),
# doubling the inference cost.
predictions = clf.predict(newX)
log(predictions)
outDf = pd.DataFrame(predictions)
outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
musicAlarm()                    # audible notification that the run finished
# 1. read in data expNo = "013" expInfo = expNo + "_data_exploration" _basePath = Config.FolderBasePath + expInfo + Config.osSep path = _basePath + expNo + "_train_tobe.csv" testPath = _basePath + expNo + "_test_asis.csv" dr = DataReader() dr.readInCSV(path, "train") newX, newY = dr._trainDataFrame, dr._ansDataFrame # Get all best model from newX fab = ModelFactory() fab._gridSearchFlag = True fab._subFolderName = "binary" fab._n_iter_search = 50 fab._expInfo = expInfo fab.getAllModels(newX, newY) musicAlarm() # Test all data modelList = [ "Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression" ] # featureList = ["event_type", "log_feature", "resource_type", "severity_type"] # for tmpFeature in featureList: # modelFolder = _basePath + "models" + Config.osSep # for tmpModel in modelList:
# Stacking stage: train each base classifier eachClfLoopTimes times on
# fold 1, then predict fold 2 (meta-features) and the test fold.
# NOTE(review): `expInfo`, `train_fold_1`, `train_fold_label_1`,
# `train_fold_2` and `test_fold_2` are defined outside this chunk.
clfList = ["xgboost", "rf","extra_tree",]
fab = ModelFactory()
fab._gridSearchFlag = True      # enable randomized parameter search
dfUpper = pd.DataFrame()        # presumably accumulates fold-2 meta-features
dfTestUpper = pd.DataFrame()    # presumably accumulates test-fold meta-features
eachClfLoopTimes = 3            # repetitions per classifier type
iter_search_times = 1           # parameter-search iterations per fit
for tmpClfName in clfList:
    for i in range(0,eachClfLoopTimes):
        fab._subFolderName = tmpClfName
        fab._n_iter_search = iter_search_times
        fab._expInfo = expInfo
        # Dispatch to the matching ModelFactory builder.  NOTE(review):
        # "knn" is handled here but absent from clfList above.
        if tmpClfName == "rf":
            clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "knn":
            clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "extra_tree":
            clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "xgboost":
            clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)
        # xgboost predicts on DMatrix-wrapped inputs; other models do not.
        if tmpClfName == "xgboost":
            predictResult = clf.predict(xgb.DMatrix(train_fold_2))
            predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
        else:
            # NOTE(review): chunk truncated here -- the else-branch body
            # continues outside this view.
# Train every model type on the binary training set.
# NOTE(review): `expNo` is defined outside this chunk; this block is a
# near-duplicate of the experiment-013 pipeline elsewhere in the file.
expInfo = expNo + "_data_exploration"
_basePath = Config.FolderBasePath + expInfo + Config.osSep

path = _basePath + expNo + "_train_tobe.csv"
testPath = _basePath + expNo + "_test_asis.csv"   # NOTE(review): not used within this chunk
dr = DataReader()
dr.readInCSV( path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame

# Get all best model from newX
fab = ModelFactory()
fab._gridSearchFlag = True      # enable randomized parameter search
fab._subFolderName = "binary"
fab._n_iter_search = 50         # parameter-search iterations
fab._expInfo = expInfo
fab.getAllModels(newX, newY)
musicAlarm()                    # audible notification when training finishes

# Test all data
modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
# featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
# for tmpFeature in featureList:
#     modelFolder = _basePath + "models" + Config.osSep
#     for tmpModel in modelList:
#         curModel = tmpModel
#         dr = DataReader()
# NOTE(review): this chunk starts mid-expression -- the opening of the
# classifier-name list (e.g. `clfList = [`) is outside this view.
    "xgboost",
    "rf",
    "extra_tree",
]
# Stacking stage: train each base classifier eachClfLoopTimes times on
# fold 1, then predict fold 2 (meta-features) and the test fold.
# NOTE(review): `expInfo` and the fold variables are defined outside
# this chunk.
fab = ModelFactory()
fab._gridSearchFlag = True      # enable randomized parameter search
dfUpper = pd.DataFrame()        # presumably accumulates fold-2 meta-features
dfTestUpper = pd.DataFrame()    # presumably accumulates test-fold meta-features
eachClfLoopTimes = 3            # repetitions per classifier type
iter_search_times = 1           # parameter-search iterations per fit
for tmpClfName in clfList:
    for i in range(0, eachClfLoopTimes):
        fab._subFolderName = tmpClfName
        fab._n_iter_search = iter_search_times
        fab._expInfo = expInfo
        # Dispatch to the matching ModelFactory builder.  NOTE(review):
        # "knn" is handled here but absent from the visible list above.
        if tmpClfName == "rf":
            clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "knn":
            clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "extra_tree":
            clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "xgboost":
            clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)
        # xgboost predicts on DMatrix-wrapped inputs; other models do not.
        if tmpClfName == "xgboost":
            predictResult = clf.predict(xgb.DMatrix(train_fold_2))
            predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
        else:
            # NOTE(review): chunk truncated here -- the else-branch body
            # continues outside this view.
# newX = newX.append(newX.iloc()[tmpIdx]) # newX[newX.columns[0]][len(newX)-1] = 2 # print i # # #print newX.iloc()[0] # tmpOutPath = _basePath + "location_log_feature_over_sampling.csv" # print len(newX) # newX.to_csv(tmpOutPath, sep=',', encoding='utf-8') # #print len(newX) # Get all best model from newX fab = ModelFactory() fab._setXgboostTheradToOne = True fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "ismail4" fab._n_iter_search = 10 fab._expInfo = expInfo # fab.getAllModels(newX, newY) #fab.getRandomForestClf(newX, newY) fab.getXgboostClf(newX, newY) # fab.getXgboostClf(newX, newY) # log ( i , "/32 done..." ) # musicAlarm() # Test all data modelList = [ "Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression" ] # featureList = ["event_type", "log_feature", "resource_type", "severity_type"]