# 4. test all data, output 3 ans as features
#
# For every stacked classifier, load the saved model, predict class
# probabilities for the whole "010_train_tobe.csv" data set and write them to
# "<expNo>_blender<model>_train.csv" under _basePath.
#
# Model paths noted by the original author:
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

# Names used to look up the matching model file inside modelFolder.
clfNameList = [
    "Extra_Trees",
    "K_NN",
    "Random_Forest",
    "Xgboost",
    "Logistic_Regression",
]

testCsv = _basePath + "010_train_tobe.csv"
dr = DataReader()
newX, testY = dr.cvtPathListToDfList(testCsv, "train")

for curModel in clfNameList:
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
    tmpClf = loadModel(modelPath)

    # Run inference once and reuse the result for both the log line and the
    # CSV; the original called predict_proba twice per model.
    tmpProba = tmpClf.predict_proba(newX)
    log(tmpProba)

    # Only the probabilities are written. Alternative kept from the original
    # (features + probabilities side by side):
    # outDf = pd.concat([newX, pd.DataFrame(tmpProba)], axis=1)
    outDf = pd.DataFrame(tmpProba)
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
#musicAlarm()
#D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
#Logistic_Regression
#
# For every (feature, model) pair, load the model stored under
# models/<feature>/, predict class probabilities for
# "<expNo>_<feature>_test_tobe.csv" and write them to
# "010_<model>_stack_<feature>_test.csv" under _basePath.
modelList = [
    "Xgboost",
    "Random_Forest",
    "Extra_Trees",
    "K_NN",
    "Logistic_Regression",
]
featureList = [
    "event_type",
    "log_feature",
    "resource_type",
    "severity_type",
]

for tmpFeature in featureList:
    subFolder = tmpFeature

    # The input CSV and the model folder depend only on the feature, so they
    # are prepared once per feature instead of once per (feature, model) pair.
    tmpCsvPath = _basePath + expNo + "_" + tmpFeature + "_test_tobe.csv"
    dr = DataReader()
    # The file is read in "train" mode and the frame is taken from
    # _trainDataFrame, exactly as the original did.
    dr.readInCSV(tmpCsvPath, "train")
    newX = dr._trainDataFrame
    modelFolder = _basePath + "models" + Config.osSep + subFolder + Config.osSep

    for tmpModel in modelList:
        curModel = tmpModel
        modelPath = modelFolder + str(
            getMatchNameModelPath(modelFolder, curModel))
        tmpOutPath = (_basePath + "010_" + curModel + "_stack_" + subFolder
                      + "_test.csv")
        tmpClf = loadModel(modelPath)

        # Run inference once; the original evaluated predict_proba three
        # times per model (log, a discarded pd.concat, and the final frame).
        tmpProba = tmpClf.predict_proba(newX)
        log(tmpProba)

        # Only the probabilities are written. The original also built
        # pd.concat([newX, pd.DataFrame(...)], axis=1) but overwrote it
        # immediately, so that dead assignment has been dropped.
        outDf = pd.DataFrame(tmpProba)
        outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
#musicAlarm()
# log("004 Done")
# Score the data held by `dr` with the "binary" Xgboost model and collect the
# rows whose probability at the column given by their answer value is below
# 0.35. `dr` is not created in this section; it must already be populated.
newX, newY = dr._trainDataFrame, dr._ansDataFrame

# NOTE(review): dr2 is loaded but its frame is not used in this section (the
# assignment below is commented out). The read is kept in case later code
# relies on dr2.
dr2 = DataReader()
dr2.readInCSV(testPath, "test")
#newX = dr2._testDataFrame

# NOTE(review): sortIdDf is assigned from dr3 and then immediately replaced by
# the frame from dr4, so only the trainSortIdPath ids are used below.
dr3 = DataReader()
dr3.readInCSV(testSortIdPath, "test")
sortIdDf = dr3._testDataFrame
dr4 = DataReader()
dr4.readInCSV(trainSortIdPath, "test")
sortIdDf = dr4._testDataFrame

modelFolder = _basePath + "models" + Config.osSep + "binary" + Config.osSep
curModel = "Xgboost"
modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
# NOTE(review): tmpOutPath is computed but nothing is written to it here.
tmpOutPath = _basePath + expNo + "_" + curModel + "_test_ans.csv"
tmpClf = loadModel(modelPath)

# Run inference once and reuse it; the original called predict_proba twice.
ans = tmpClf.predict_proba(newX)
log(ans)

# First column of the sort-id frame, looked up once instead of per row.
idColumn = sortIdDf[sortIdDf.columns[0]]

ansList = []
# NOTE(review): this assumes iterating _ansDataFrame yields one answer value
# per row (e.g. a Series of class indices). A pandas DataFrame would yield
# column labels instead -- confirm against DataReader.
for i, tmpAns in enumerate(newY):
    tmpProb = ans[i][tmpAns]
    if tmpProb < 0.35:
        #log( "id: " + sortIdDf[sortIdDf.columns[0]][i] + ", prob: " + ans[i][tmpAns], ", cate: " + tmpAns)
        tmpRow = (idColumn[i], tmpProb, tmpAns)
        log(tmpRow)
        ansList.append(tmpRow)
log(len(ansList))