예제 #1
0
    def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
        """Log the top-ranked search results and optionally mail them.

        The records in ``grid_scores`` are ordered by their element at
        index 1, highest first, and the first ``n_top`` are reported through
        ``log``. The top record's ``mean_validation_score`` is stored in
        ``self._bestScoreDict[clfName]``. When ``self._singleModelMail`` is
        set, the same report text is sent through ``mail``.

        Args:
            grid_scores: Iterable of score records. Each record must be
                indexable (index 1 is the sort key) and expose
                ``mean_validation_score``, ``cv_validation_scores`` and
                ``parameters``.
            clfName: Key under which the best score is stored; also passed
                to ``mail``.
            bestLogLoss: Not used by this method; kept so existing callers
                keep working.
            n_top: Maximum number of records to report.

        Returns:
            The ``parameters`` of the top-ranked record, or an empty dict
            when ``grid_scores`` yields no records.
        """
        top_scores = sorted(grid_scores, key=itemgetter(1),
                            reverse=True)[:n_top]
        bestParameters = {}
        mailLines = []
        for rank, score in enumerate(top_scores, start=1):
            # Format every report line once and reuse it for log and mail.
            lines = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    score.mean_validation_score,
                    np.std(score.cv_validation_scores)),
                "Parameters: {0}".format(score.parameters),
            ]
            for line in lines:
                log(line)
            mailLines.extend(lines)

            if rank == 1:
                # Previously the returned dict was never filled in, so the
                # caller always received {} regardless of the search result.
                bestParameters = score.parameters
                self._bestScoreDict[clfName] = score.mean_validation_score
                mailLines.append(
                    "Best CV score: " + str(score.mean_validation_score))

            log("")

        # NOTE(review): the explicit `== True` is kept on purpose so that
        # non-bool flag values behave exactly as they did before.
        if self._singleModelMail == True:
            mailContent = "".join(line + "\n" for line in mailLines)
            mail("Single Model Done: ", clfName, ", ", mailContent)
        return bestParameters
예제 #2
0
    def autoFlow(self, numIter, outputPath):
        """Blend with ``numIter`` random weightings and save the best as CSV.

        Each iteration draws a weight list from ``getRandomWeightList``,
        blends with ``doBlending`` and scores the result with
        ``calLogLoss``. Candidates are ranked by ascending score; the three
        lowest are logged and mailed, the lowest one's weights are stored in
        ``self._bestParamList`` and its blended frame is written to
        ``outputPath``.

        Args:
            numIter: Number of random weightings to try; must be >= 1.
            outputPath: Destination passed to the blended frame's ``to_csv``.

        Raises:
            ValueError: If ``numIter`` is smaller than 1. Previously this
                case fell through to ``[].to_csv`` and failed with an
                AttributeError after the summary mail had been sent.
        """
        if numIter < 1:
            raise ValueError(
                "numIter must be at least 1, got {0}".format(numIter))

        log("Start blending autoFlow, num of Iter: ", numIter)
        start = time.time()
        distinctModels = len(self._clfNameList)
        resultList = []
        weightLists = []
        blendedDfs = []
        for _ in range(numIter):
            weights = self.getRandomWeightList(distinctModels)
            weightLists.append(weights)
            blended = self.doBlending(weights)
            blendedDfs.append(blended)
            resultList.append(self.calLogLoss(blended))

        # argsort is ascending, so these are the (up to) three lowest scores.
        topIdx = np.array(resultList).argsort()[:3]
        bestIdx = topIdx[0]
        finalDf = blendedDfs[bestIdx]
        self._bestParamList = weightLists[bestIdx]

        logResult = []
        for idx in topIdx:
            log("logloss: ", resultList[idx], "blender param: ",
                weightLists[idx])
            logResult.append((resultList[idx], weightLists[idx]))
        mail("Blender Top3: ", logResult, self._clfNameList)
        log("clfNameList = ", self._clfNameList)
        log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ",
            self._lowProbIdList)
        log("End blending autoFlow, num of Iter: ", numIter, " cost: ",
            time.time() - start, " sec")

        finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
예제 #3
0
파일: Blender.py 프로젝트: kusogray/Telstra
 def autoFlow(self, numIter, outputPath):
     """Run a random-weight blending search and write the best blend to CSV.

     For every iteration a weight list is drawn (``getRandomWeightList``),
     applied (``doBlending``) and scored (``calLogLoss``). The candidates
     are ordered by ascending score. The three lowest are logged and
     mailed; the lowest one supplies ``self._bestParamList`` and the frame
     saved to ``outputPath``.

     Args:
         numIter: How many random weight lists to evaluate; at least 1.
         outputPath: Target handed to ``to_csv`` of the winning frame.

     Raises:
         ValueError: When ``numIter`` is below 1. The old code reached
             ``[].to_csv`` in that case and crashed with AttributeError.
     """
     if numIter < 1:
         raise ValueError(
             "numIter must be at least 1, got {0}".format(numIter))

     log("Start blending autoFlow, num of Iter: ", numIter)
     start = time.time()
     distinctModels = len(self._clfNameList)

     # One (score, weights, blended frame) triple per iteration.
     candidates = []
     for _ in range(numIter):
         weights = self.getRandomWeightList(distinctModels)
         blended = self.doBlending(weights)
         candidates.append((self.calLogLoss(blended), weights, blended))

     # Ascending argsort: the first index belongs to the lowest score.
     ranking = np.array([entry[0] for entry in candidates]).argsort()[:3]
     _, self._bestParamList, finalDf = candidates[ranking[0]]

     logResult = []
     for pos in ranking:
         score, weights, _ = candidates[pos]
         log("logloss: ", score, "blender param: ", weights)
         logResult.append((score, weights))
     mail("Blender Top3: ", logResult, self._clfNameList)
     log("clfNameList = ", self._clfNameList)
     log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ",
         self._lowProbIdList)
     log("End blending autoFlow, num of Iter: ", numIter, " cost: ",
         time.time() - start, " sec")

     finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
예제 #4
0
 def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
     """Report the best search records via ``log`` and, optionally, ``mail``.

     ``grid_scores`` is sorted on element 1 of each record in descending
     order and truncated to ``n_top`` entries. Every kept record is logged;
     the first one also updates ``self._bestScoreDict[clfName]`` with its
     ``mean_validation_score``. If ``self._singleModelMail`` is set, the
     logged text is mailed as well.

     Args:
         grid_scores: Iterable of indexable records exposing
             ``mean_validation_score``, ``cv_validation_scores`` and
             ``parameters``.
         clfName: Dictionary key for the best score; also forwarded to
             ``mail``.
         bestLogLoss: Unused here; retained for caller compatibility.
         n_top: Upper bound on the number of reported records.

     Returns:
         ``parameters`` of the highest-ranked record, or ``{}`` if there
         are no records.
     """
     top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
     bestParameters = {}
     mailParts = []

     def emit(text):
         # Send one report line to the log and queue it for the mail body.
         log(text)
         mailParts.append(text + "\n")

     for i, score in enumerate(top_scores):
         emit("Model with rank: {0}".format(i + 1))
         emit("Mean validation score: {0:.3f} (std: {1:.3f})".format(
             score.mean_validation_score,
             np.std(score.cv_validation_scores)))
         emit("Parameters: {0}".format(score.parameters))

         if i == 0:
             # The returned dict used to stay empty because nothing ever
             # assigned to it; hand back the winner's parameters instead.
             bestParameters = score.parameters
             self._bestScoreDict[clfName] = score.mean_validation_score
             mailParts.append(
                 "Best CV score: " + str(score.mean_validation_score) + "\n")

         log("")

     # NOTE(review): `== True` is left untouched so that a non-bool flag
     # keeps its previous meaning.
     if self._singleModelMail == True:
         mail("Single Model Done: ", clfName, ", ", "".join(mailParts))
     return bestParameters
예제 #5
0
 def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
     """Dump the Xgboost model, log its result and optionally mail it.

     Args:
         bestClf: Model object handed to ``dumpModel``.
         bestScore: Score included in the log and mail text.
         paramList: Parameters included in the log and mail text.
         best_num_round: Round count included in the log and passed to
             ``mail`` as its last argument.
     """
     dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)

     logArgs = ("Native Xgboost best score : ", bestScore,
                ", param list: ", paramList,
                "best_num_round: ", best_num_round)
     log(*logArgs)

     # Nothing more to do unless per-model mails are switched on.
     if not (self._singleModelMail == True):
         return

     summary = ("Native Xgboost best score : {0!s}"
                ", param list: {1!s}"
                "best_num_round: ").format(bestScore, paramList)
     mail("Xgboost Done", summary, best_num_round)
예제 #6
0
 def getAllModels(self, X, Y):
     """Build every enabled classifier and select the best-scoring one.

     The tree-based classifiers are always built; K-NN, logistic regression
     and naive Bayes are added unless ``self._onlyTreeBasedModels`` is set.
     Results are stored in ``self._basicClf`` under fixed names. Afterwards
     the entries of ``self._bestScoreDict`` are ranked by score (highest
     first), logged and mailed, and the top name is looked up in
     ``self._bestClf`` to set ``self._mvpClf``.

     Args:
         X: Passed unchanged to each ``get*Clf`` helper.
         Y: Passed unchanged to each ``get*Clf`` helper.

     Raises:
         IndexError: If ``self._bestScoreDict`` is empty after building.
     """
     log("GetAllModels start with iteration numbers: ", self._n_iter_search)
     start = time.time()

     self._basicClf["Xgboost"] = self.getXgboostClf(X, Y)
     self._basicClf["Random_Forest"] = self.getRandomForestClf(X, Y)
     self._basicClf["Extra_Trees"] = self.getExtraTressClf(X, Y)

     if not self._onlyTreeBasedModels:
         self._basicClf["K_NN"] = self.getKnnClf(X, Y)
         self._basicClf["Logistic_Regression"] = self.getLogisticRegressionClf(X, Y)
         self._basicClf["Naive_Bayes"] = self.getNaiveBayesClf(X, Y)

     log("GetAllModels cost: ", time.time() - start, " sec")

     # Rank once and reuse; the same sort used to be repeated four times.
     bestScoreList = sorted(self._bestScoreDict.items(),
                            key=lambda x: x[1], reverse=True)
     log(bestScoreList)
     mail(self._expInfo, bestScoreList)
     log(self._expInfo, bestScoreList)

     mvpName = bestScoreList[0][0]
     log("MVP clf is : ", mvpName)
     self._mvpClf = self._bestClf[mvpName]
     log("GetAllModels end with iteration numbers: ", self._n_iter_search)
예제 #7
0
    def getAllModels(self, X, Y):
        """Build the enabled classifiers, report their scores, pick the best.

        Each ``get*Clf`` helper is called with ``X`` and ``Y`` and its
        result stored in ``self._basicClf``. The three tree-based helpers
        always run; the other three run only when
        ``self._onlyTreeBasedModels`` is falsy. ``self._bestScoreDict`` is
        then ranked by value in descending order, logged and mailed, and
        ``self._mvpClf`` is set to the ``self._bestClf`` entry named by the
        top-ranked key.

        Args:
            X: Forwarded as-is to every ``get*Clf`` helper.
            Y: Forwarded as-is to every ``get*Clf`` helper.

        Raises:
            IndexError: If ``self._bestScoreDict`` holds no entries.
        """
        log("GetAllModels start with iteration numbers: ", self._n_iter_search)
        start = time.time()

        treeBuilders = (
            ("Xgboost", self.getXgboostClf),
            ("Random_Forest", self.getRandomForestClf),
            ("Extra_Trees", self.getExtraTressClf),
        )
        for name, build in treeBuilders:
            self._basicClf[name] = build(X, Y)

        if not self._onlyTreeBasedModels:
            otherBuilders = (
                ("K_NN", self.getKnnClf),
                ("Logistic_Regression", self.getLogisticRegressionClf),
                ("Naive_Bayes", self.getNaiveBayesClf),
            )
            for name, build in otherBuilders:
                self._basicClf[name] = build(X, Y)

        log("GetAllModels cost: ", time.time() - start, " sec")

        # The ranking was previously recomputed for every log/mail call;
        # sort once and share the result.
        bestScoreList = sorted(self._bestScoreDict.items(),
                               key=lambda x: x[1],
                               reverse=True)
        log(bestScoreList)
        mail(self._expInfo, bestScoreList)
        log(self._expInfo, bestScoreList)

        mvpName = bestScoreList[0][0]
        log("MVP clf is : ", mvpName)
        self._mvpClf = self._bestClf[mvpName]
        log("GetAllModels end with iteration numbers: ", self._n_iter_search)
예제 #8
0
 def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
     """Persist the Xgboost model and report its score by log and mail.

     The model is handed to ``dumpModel`` and the result is always logged.
     A mail is sent only when ``self._singleModelMail`` compares equal to
     ``True``.

     Args:
         bestClf: Model object to dump.
         bestScore: Score to report.
         paramList: Parameters to report.
         best_num_round: Round count to report.
     """
     dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)
     log(
         "Native Xgboost best score : ", bestScore,
         ", param list: ", paramList,
         "best_num_round: ", best_num_round,
     )
     wantsMail = self._singleModelMail == True
     if wantsMail:
         body = "".join([
             "Native Xgboost best score : ",
             str(bestScore),
             ", param list: ",
             str(paramList),
             "best_num_round: ",
         ])
         mail("Xgboost Done", body, best_num_round)