# The methods below belong to the project's model-training / blending classes;
# they assume module-level imports along the lines of:
#   from operator import itemgetter
#   import numpy as np
#   import time
# plus the project's own log / mail / dumpModel helpers.
def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
    # Rank grid-search results, log the top n_top parameter sets, and collect
    # the same summary for a notification mail.
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    bestParameters = {}
    mailContent = ""
    for i, score in enumerate(top_scores):
        log("Model with rank: {0}".format(i + 1))
        log("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        log("Parameters: {0}".format(score.parameters))
        mailContent += "Model with rank: {0}\n".format(i + 1)
        mailContent += "Mean validation score: {0:.3f} (std: {1:.3f})\n".format(
            score.mean_validation_score, np.std(score.cv_validation_scores))
        mailContent += "Parameters: {0}\n".format(score.parameters)
        if i == 0:
            self._bestScoreDict[clfName] = score.mean_validation_score
            mailContent += "Best CV score: " + str(score.mean_validation_score) + "\n"
        log("")
    #log(clfName, " best logloss: ", bestLogLoss)
    if self._singleModelMail:
        mail("Single Model Done: ", clfName, ", ", mailContent)
    return bestParameters
def autoFlow(self, numIter, outputPath):
    # Draw numIter random weight vectors, blend the per-model predictions with
    # each, rank the blends by log loss, and write the best blend to CSV.
    log("Start blending autoFlow, num of Iter: ", numIter)
    start = time.time()
    distinctModels = len(self._clfNameList)
    tmpResultList = []
    tmpRandomWeightList = []
    tmpBlendedDfList = []
    for i in range(0, numIter):
        tmpWeightList = self.getRandomWeightList(distinctModels)
        tmpRandomWeightList.append(tmpWeightList)
        tmpDf = self.doBlending(tmpWeightList)
        tmpBlendedDfList.append(tmpDf)
        tmpResultList.append(self.calLogLoss(tmpDf))

    # indices of the three blends with the lowest log loss
    idList = np.array(tmpResultList).argsort()[:3]
    firstFlag = True
    finalDf = []
    logResult = []
    for id in idList:
        if firstFlag:
            # the first (lowest-logloss) blend becomes the final submission
            finalDf = tmpBlendedDfList[id]
            self._bestParamList = tmpRandomWeightList[id]
            firstFlag = False
        log("logloss: ", tmpResultList[id], "blender param: ", tmpRandomWeightList[id])
        logResult.append((tmpResultList[id], tmpRandomWeightList[id]))

    mail("Blender Top3: ", logResult, self._clfNameList)
    log("clfNameList = ", self._clfNameList)
    log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ", self._lowProbIdList)
    log("End blending autoFlow, num of Iter: ", numIter, " cost: ",
        time.time() - start, " sec")
    finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
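# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): autoFlow's idea is to draw
# random weight vectors, average the per-model class probabilities with them,
# and keep the blends with the lowest log loss.  The standalone snippet below
# shows that technique on made-up data; predsA, predsB and yTrue are
# hypothetical stand-ins for the prediction DataFrames and validation labels
# the blender object holds.
import numpy as np
from sklearn.metrics import log_loss

rng = np.random.default_rng(0)
predsA = rng.dirichlet(np.ones(3), size=100)   # fake class probabilities, model A
predsB = rng.dirichlet(np.ones(3), size=100)   # fake class probabilities, model B
yTrue = rng.integers(0, 3, size=100)           # fake validation labels

results = []
for _ in range(50):
    w = rng.dirichlet(np.ones(2))               # random weights that sum to 1
    blended = w[0] * predsA + w[1] * predsB     # weighted average of probabilities
    results.append((log_loss(yTrue, blended), w))

# keep the three lowest-logloss blends, mirroring argsort()[:3] above
for loss, w in sorted(results, key=lambda t: t[0])[:3]:
    print("logloss:", round(loss, 5), "blender param:", w)
# ---------------------------------------------------------------------------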
def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
    # Persist the native Xgboost model and report its best score, parameters,
    # and boosting-round count.
    dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)
    log("Native Xgboost best score : ", bestScore, ", param list: ", paramList,
        ", best_num_round: ", best_num_round)
    if self._singleModelMail:
        mail("Xgboost Done",
             "Native Xgboost best score : " + str(bestScore) +
             ", param list: " + str(paramList) +
             ", best_num_round: " + str(best_num_round))
def getAllModels(self, X, Y):
    # Fit every base model, rank them by their stored CV score, and keep the
    # best one as the MVP classifier.
    log("GetAllModels start with iteration numbers: ", self._n_iter_search)
    start = time.time()
    self._basicClf["Xgboost"] = self.getXgboostClf(X, Y)
    self._basicClf["Random_Forest"] = self.getRandomForestClf(X, Y)
    self._basicClf["Extra_Trees"] = self.getExtraTressClf(X, Y)
    if not self._onlyTreeBasedModels:
        self._basicClf["K_NN"] = self.getKnnClf(X, Y)
        self._basicClf["Logistic_Regression"] = self.getLogisticRegressionClf(X, Y)
        self._basicClf["Naive_Bayes"] = self.getNaiveBayesClf(X, Y)
    log("GetAllModels cost: ", time.time() - start, " sec")

    bestScoreList = sorted(self._bestScoreDict.items(), key=lambda x: x[1], reverse=True)
    log(bestScoreList)
    mail(self._expInfo, bestScoreList)
    log(self._expInfo, bestScoreList)
    log("MVP clf is : ", bestScoreList[0][0])
    self._mvpClf = self._bestClf[bestScoreList[0][0]]
    log("GetAllModels end with iteration numbers: ", self._n_iter_search)
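# Illustrative sketch (standalone; the scores below are made-up stand-ins for
# self._bestScoreDict): picking the MVP model is just a sort of the CV-score
# dict by value, highest first, then taking the first key.
bestScoreDict = {"Xgboost": 0.842, "Random_Forest": 0.815, "Extra_Trees": 0.821}
bestScoreList = sorted(bestScoreDict.items(), key=lambda x: x[1], reverse=True)
print("MVP clf is :", bestScoreList[0][0])   # -> Xgboost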