# 4. test all data, output 3 ans as features
#
# For every stacked classifier, load its saved model, run predict_proba on the
# prepared data set and write the result to one CSV per classifier.
#
# Model files look like:
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

clfNameList = [
    "Extra_Trees",
    "K_NN",
    "Random_Forest",
    "Xgboost",
    "Logistic_Regression",
]

# NOTE(review): despite the name, testCsv points at the *train* file and is
# read in "train" mode; testY is not used in this step.
testCsv = _basePath + "010_train_tobe.csv"
dr = DataReader()
newX, testY = dr.cvtPathListToDfList(testCsv, "train")

for curModel in clfNameList:
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
    tmpClf = loadModel(modelPath)

    # Predict once; the same array feeds both the log line and the CSV.
    proba = tmpClf.predict_proba(newX)
    log(proba)

    outDf = pd.DataFrame(proba)
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
# musicAlarm()
# 1. read in data
#
# Experiment identifiers and the folder every path below is built from.
expNo = "008"
expInfo = expNo + "_blender"
_basePath = Config.FolderBasePath + expInfo + Config.osSep
doTestFlag = False

path = _basePath + expNo + "_train_tobe.csv"
testPath = _basePath + expNo + "_test_tobe.csv"
# NOTE(review): unlike the two paths above there is no "_" after expNo here
# ("008blender_train.csv") - confirm this file name is intended.
outputPath = _basePath + expNo + "blender_train.csv"

# 1. read data
#
# Load the per-classifier train submissions with a single reader and collect
# the feature frames in classifier order. After the loop tmpPath, newX and
# newY refer to the last classifier read (Xgboost).
dr = DataReader()
tmpDfList = []
for srcClfName in ("Extra_Trees", "K_NN", "Random_Forest", "Xgboost"):
    tmpPath = _basePath + "008_submission_1_train_" + srcClfName + ".csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)

clfNameList = []
[0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354], [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619], [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111], ] # for tmpFeature in featureList: # dr = DataReader() # tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv" # newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") # tmpDf = pd.concat([tmpDf, newX], axis=1) # # tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') tmpI, tmpJ = 0, 0 dr = DataReader() baseDf, ansY = dr.cvtPathListToDfList(_basePath + "010_blenderXgboost_train.csv", "train") tmpOutPath = _basePath + "010_train_last_blender.csv" tmpFeatureBlendedAns = pd.DataFrame() baseDf = pd.DataFrame() tmpDfList = [] for tmpClfName in clfNameList: dr = DataReader() tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv" newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") tmpDfList.append(newX) b1 = Blender(clfNameList, tmpDfList, ansY) b1.autoFlow(2000, tmpOutPath) # test
# Fragment of a loop body: `i`, flagList, tmpCurFeatureList, featureList and
# newY are set up before this point (not visible here).

# Turn the binary digits of i (the text after the "0b" prefix) into 0/1 flags.
for bitChar in bin(i)[2:]:
    flagList.append(int(bitChar))

# Keep the features whose flag is set.
# NOTE(review): this indexes flagList[0..4]; confirm flagList always holds at
# least five entries, since bin(i) yields fewer digits for small i.
for j in range(5):
    if flagList[j] != 1:
        continue
    tmpCurFeatureList.append(featureList[j])
log(tmpCurFeatureList)

# Concatenate the selected feature files column-wise.
# NOTE(review): "_train.csv" files are read in "test" mode here - confirm.
newX = pd.DataFrame()
for tmpFeature in tmpCurFeatureList:
    path = _basePath + tmpFeature + "_train.csv"
    dr = DataReader()
    tmpX = dr.cvtPathListToDfList(path, "test")
    newX = pd.concat([newX, tmpX], axis=1)
# log("feature len: " , len(newX))

# Get all best model from newX
fab = ModelFactory()
# NOTE(review): attribute name is spelled "Therad"; verify it matches the
# attribute ModelFactory actually reads.
fab._setXgboostTheradToOne = True
fab._gridSearchFlag = True
fab._onlyTreeBasedModels = True
fab._subFolderName = "one_hot_each_" + str(i)
fab._n_iter_search = 30
fab._expInfo = expInfo
# fab.getAllModels(newX, newY)
fab.getRandomForestClf(newX, newY)
# fab.getXgboostClf(newX, newY)
log(i, "/32 done...")
[0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111] ] tmpOutPath = _basePath + "010_test_tobe.csv" # for tmpFeature in featureList: # dr = DataReader() # tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv" # newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") # tmpDf = pd.concat([tmpDf, newX], axis=1) # # tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') tmpI, tmpJ = 0, 0 dr = DataReader() baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv", "test") for tmpFeature in featureList: outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv" #ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv" #dr = DataReader() #tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train") #tmpDfList = [] tmpFeatureBlendedAns = pd.DataFrame() for tmpClfName in clfNameList: dr = DataReader() tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv" newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") tmpWight = tmpWeightList[tmpI][tmpJ] newX = newX.multiply(tmpWight)
[0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354], [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619], [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111] ] # for tmpFeature in featureList: # dr = DataReader() # tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv" # newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") # tmpDf = pd.concat([tmpDf, newX], axis=1) # # tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') tmpI, tmpJ = 0, 0 dr = DataReader() baseDf, ansY = dr.cvtPathListToDfList( _basePath + "010_blenderXgboost_train.csv", "train") tmpOutPath = _basePath + "010_train_last_blender.csv" tmpFeatureBlendedAns = pd.DataFrame() baseDf = pd.DataFrame() tmpDfList = [] for tmpClfName in clfNameList: dr = DataReader() tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv" newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") tmpDfList.append(newX) b1 = Blender(clfNameList, tmpDfList, ansY) b1.autoFlow(2000, tmpOutPath) # test
# Get all best model from newX
#
# Run the model search over the training data (newX / newY set up earlier).
fab = ModelFactory()
fab._gridSearchFlag = True
# fab._subFolderName = "stacked"
fab._n_iter_search = 100
fab._expInfo = expInfo
fab.getAllModels(newX, newY)

# Test all data
#
# For every saved model, run predict_proba on the test set and write the
# result to "011_<model>_test_ans.csv".
modelList = ["Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression"]
# featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
# for tmpFeature in featureList:
modelFolder = _basePath + "models" + Config.osSep

# The test set is the same for every model, so read it once up front instead
# of re-parsing the CSV on each iteration. From here on newX is the test data.
dr = DataReader()
newX = dr.cvtPathListToDfList(testPath, "test")

for tmpModel in modelList:
    # Both names are kept bound in case code after this step reads either.
    curModel = tmpModel
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + "011_" + curModel + "_test_ans.csv"
    tmpClf = loadModel(modelPath)

    # Predict once; the same array feeds both the log line and the CSV.
    # (A features+predictions concat used to be built here and was thrown
    # away immediately; only the predictions are written.)
    proba = tmpClf.predict_proba(newX)
    log(proba)

    outDf = pd.DataFrame(proba)
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
# musicAlarm()
# log("004 Done")
[0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111], ] tmpOutPath = _basePath + "010_test_tobe.csv" # for tmpFeature in featureList: # dr = DataReader() # tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv" # newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") # tmpDf = pd.concat([tmpDf, newX], axis=1) # # tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') tmpI, tmpJ = 0, 0 dr = DataReader() baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv", "test") for tmpFeature in featureList: outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv" # ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv" # dr = DataReader() # tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train") # tmpDfList = [] tmpFeatureBlendedAns = pd.DataFrame() for tmpClfName in clfNameList: dr = DataReader() tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv" newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train") tmpWight = tmpWeightList[tmpI][tmpJ] newX = newX.multiply(tmpWight)
# 4. test all data, output 3 ans as features
#
# For every stacked classifier, load its saved model, run predict_proba on the
# prepared data set and write the result to one CSV per classifier.
#
# Model files look like:
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

clfNameList = [
    "Extra_Trees",
    "K_NN",
    "Random_Forest",
    "Xgboost",
    "Logistic_Regression",
]

# NOTE(review): despite the name, testCsv points at the *train* file and is
# read in "train" mode; testY is not used in this step.
testCsv = _basePath + "010_train_tobe.csv"
dr = DataReader()
newX, testY = dr.cvtPathListToDfList(testCsv, "train")

for curModel in clfNameList:
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
    tmpClf = loadModel(modelPath)

    # Predict once; the same array feeds both the log line and the CSV.
    proba = tmpClf.predict_proba(newX)
    log(proba)

    outDf = pd.DataFrame(proba)
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
# musicAlarm()