Example 1
class EDA:
    log = Logger(name='MLFramework')

    @staticmethod
    def analysis(df, targetfeat, config):
        pairwise_analysis = 'on'  # Correlation and other pairwise associations can take a long time; past a size threshold this must be set explicitly to 'on' or 'off' to decide whether to run the pairwise analysis.
        report_train = sv.analyze([df, 'train'],
                                  target_feat=targetfeat,
                                  pairwise_analysis=pairwise_analysis)
        report_train.show_html(
            filepath='./report/{}_AnalysisReport.html'.format(
                config.modelFileKey))  # save as HTML

    @staticmethod
    def compare(df_train, df_test, targetfeat, config):
        pairwise_analysis = 'on'  # Correlation and other pairwise associations can take a long time; past a size threshold this must be set explicitly to 'on' or 'off' to decide whether to run the pairwise analysis.
        compare_subsets_report = sv.compare(
            [df_train, 'Train'],  # use compare
            [df_test, 'Test'],
            target_feat=targetfeat,
            pairwise_analysis=pairwise_analysis)

        compare_subsets_report.show_html(
            filepath='./report/{}_CompareReport.html'.format(
                config.modelFileKey))
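A minimal usage sketch for the EDA helper above (hypothetical: it assumes sweetviz imported as sv, an existing ./report folder, and a config object exposing modelFileKey, as the snippet implies):

import pandas as pd

class DemoConfig:
    modelFileKey = 'demo'  # hypothetical key used to name the HTML report

df = pd.read_csv('./data/train.csv')  # any DataFrame with a numeric target
EDA.analysis(df, targetfeat='price', config=DemoConfig())
# writes ./report/demo_AnalysisReport.html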
Example 2
    def toggle_debug_mode(self, enabled):
        self.debug_mode = enabled
        if enabled:
            self.debug_folder = DEBUG_FOLDER_ROOT + '/' + str(
                time.time()) + '/'
            if not os.path.exists(self.debug_folder):
                os.makedirs(self.debug_folder)  # also creates missing parent folders
                print("New folder created: " + self.debug_folder)

            self.debug_log_file = self.debug_folder + 'log.txt'
            f = open(self.debug_log_file, 'w')
            f.close()

            self.logger = Logger(self.debug_log_file)
            sys.stdout = self.logger
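For the sys.stdout = self.logger assignment above to work, the Logger must implement the stream protocol. A minimal tee-style sketch (an assumption; the real Logger class is not shown in this snippet):

import sys

class Logger:
    def __init__(self, logfile):
        self.terminal = sys.stdout
        self.logfile = logfile

    def write(self, message):
        self.terminal.write(message)        # echo to the console
        with open(self.logfile, 'a') as f:  # and append to the log file
            f.write(message)

    def flush(self):
        self.terminal.flush()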
Example 3
 def __init__(self):
     self.log = Logger(name='MLFramework')
     self.log.debug('ML Model Base init..%s' % self.__class__.__name__)
Example 4
class MLModelBase(metaclass=abc.ABCMeta):
    def __init__(self):
        self.log = Logger(name='MLFramework')
        self.log.debug('ML Model Base init..%s' % self.__class__.__name__)

    def doTraining(self, X, y, config):
        self.log.debug('%s doTraining ' % self.__class__.__name__)
        modelname = self.__class__.__name__
        h5File = "./model/" + modelname + "_{0}.h5".format(config.modelFileKey)
        modelFile = "./model/" + modelname + "_{0}.model".format(
            config.modelFileKey)
        feature_list = config._featureList
        if (os.path.isfile(h5File) and config.forceRetrain == False):
            print("training " + modelname + " :load model from file " + h5File)
            model = tf.keras.models.load_model(h5File)
        else:
            if (os.path.isfile(modelFile) and config.forceRetrain == False):
                print("training " + modelname + " :load model from file " +
                      modelFile)
                self.log.debug("training " + modelname +
                               ":load model from file " + modelFile)
                model = joblib.load(modelFile)
            else:
                print("training " + modelname + " :training model...")
                self.log.debug("training " + modelname + ":training model...")
                model = self.training(X, y)
                if hasattr(model, 'history'):
                    history = model.history
                    self.log.debug('{}-{}-history epochs:{}'.format(
                        modelname, config.modelFileKey,
                        len(history.history['loss'])))
                    model.summary(print_fn=lambda x: self.log.debug(x))
                if hasattr(model, 'clf'):
                    clf = model.clf
                    self.log.debug(
                        f"best score: {clf.best_score_}, best params: {clf.best_params_}")
                predicted = model.predict(X)
                r2 = metrics.r2_score(y, predicted)
                self.log.debug('{}-{}-model performance..R2:{}'.format(
                    modelname, config.modelFileKey, r2))
                if hasattr(model, 'save'):
                    model.save(h5File)
                else:
                    joblib.dump(model, modelFile)
        ''' # print coefficients
        print(lm.coef_)
        # print intercept
        print(lm.intercept_ )
        # model performance
        mse = np.mean((lm.predict(X) - y) ** 2)
        r_squared = lm.score(X, y)
        adj_r_squared = r_squared - (1 - r_squared) * (X.shape[1] / (X.shape[0] - X.shape[1] - 1))
        # print model performance
        print('MSE:{0}'.format(mse))
        print('R2:{0}'.format(r_squared))
        print('adj_R2:{0}'.format(adj_r_squared))   '''

        if hasattr(model, 'feature_importances_'):
            # Get numerical feature importances
            importances = list(model.feature_importances_)
            # List of tuples with variable and importance
            feature_importances = [
                (feature, np.round(importance, 3))
                for feature, importance in zip(feature_list, importances)
            ]
            # Sort the feature importances by most important first
            feature_importances = sorted(feature_importances,
                                         key=lambda x: x[1],
                                         reverse=True)
            # Print out the feature and importances
            #[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
            df_feature_importances = DataFrame(feature_importances)
            df_feature_importances.columns = ['Variable', 'Importance_' + modelname]
        elif hasattr(model, 'coef_'):
            # Get numerical feature importances
            importances = list(model.coef_)
            # List of tuples with variable and importance
            feature_importances = [
                (feature, abs(np.round(importance, 3)))
                for feature, importance in zip(feature_list, importances)
            ]
            # Sort the feature importances by most important first
            feature_importances = sorted(feature_importances,
                                         key=lambda x: x[1],
                                         reverse=True)
            # Print out the feature and importances
            #[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
            df_feature_importances = DataFrame(feature_importances)
            imp_col = 'Importance_' + modelname
            df_feature_importances.columns = ['Variable', imp_col]

            # scale coefficients by their spread, then normalize to sum to 1
            df_feature_importances[imp_col] = (
                df_feature_importances[imp_col] *
                df_feature_importances[imp_col].std())
            df_feature_importances[imp_col] = (
                df_feature_importances[imp_col] /
                df_feature_importances[imp_col].sum())
        else:
            df_feature_importances = DataFrame()

        return model, modelname, df_feature_importances

    def loadTestingModel(self, config):
        self.log.debug('%s doTesting' % self.__class__.__name__)
        modelname = self.__class__.__name__
        h5File = "./model/" + modelname + "_{0}.h5".format(config.modelFileKey)
        modelFile = "./model/" + modelname + "_{0}.model".format(
            config.modelFileKey)
        feature_list = config._featureList
        if (os.path.isfile(h5File)):
            print("testing " + modelname + ":load model from file " + h5File)
            model = tf.keras.models.load_model(h5File)
        if (os.path.isfile(modelFile)):
            print("testing " + modelname + ":load model from file " +
                  modelFile)
            model = joblib.load(modelFile)

        if hasattr(model, 'feature_importances_'):
            # Get numerical feature importances
            importances = list(model.feature_importances_)
            # List of tuples with variable and importance
            feature_importances = [
                (feature, np.round(importance, 3))
                for feature, importance in zip(feature_list, importances)
            ]
            # Sort the feature importances by most important first
            feature_importances = sorted(feature_importances,
                                         key=lambda x: x[1],
                                         reverse=True)
            # Print out the feature and importances
            #[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
            df_feature_importances = DataFrame(feature_importances)
            df_feature_importances.columns = ['Variable', 'Importance_' + modelname]
        elif hasattr(model, 'coef_'):
            # Get numerical feature importances
            importances = list(model.coef_)
            # List of tuples with variable and importance
            feature_importances = [
                (feature, abs(np.round(importance, 3)))
                for feature, importance in zip(feature_list, importances)
            ]
            # Sort the feature importances by most important first
            feature_importances = sorted(feature_importances,
                                         key=lambda x: x[1],
                                         reverse=True)
            # Print out the feature and importances
            #[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
            df_feature_importances = DataFrame(feature_importances)
            imp_col = 'Importance_' + modelname
            df_feature_importances.columns = ['Variable', imp_col]

            # scale coefficients by their spread, then normalize to sum to 1
            df_feature_importances[imp_col] = (
                df_feature_importances[imp_col] *
                df_feature_importances[imp_col].std())
            df_feature_importances[imp_col] = (
                df_feature_importances[imp_col] /
                df_feature_importances[imp_col].sum())
        else:
            df_feature_importances = DataFrame()

        return model, modelname, df_feature_importances

    @abc.abstractmethod
    def training(self, X, y):
        return NotImplemented
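A concrete model only has to implement training(); a minimal hypothetical subclass using scikit-learn (a sketch, not part of the project):

from sklearn.ensemble import RandomForestRegressor

class RandomForestModel(MLModelBase):
    def training(self, X, y):
        model = RandomForestRegressor(n_estimators=200, random_state=42)
        model.fit(X, y)
        # exposes feature_importances_, so doTraining can report them
        return model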
Example 5
class Featurizer:
    log = Logger(name='MLFramework')

    @staticmethod
    def Select(df, targetfeat, config, limit=0.3):
        cateCols = config.encoderColumns
        '''
        #1. Using Pearson Correlation
        '''
        Featurizer.log.debug("Featurizer=====Using Pearson Correlation")
        # df = pd.concat(( df,  pd.get_dummies(df,columns=cateCols,prefix_sep='_') ), axis=1).drop(cateCols,1)
        df = pd.get_dummies(df, columns=cateCols, prefix_sep='_')
        cor = df.corr()

        #Correlation with output variable
        cor_target = abs(cor[targetfeat])
        #Selecting highly correlated features
        relevant_features = cor_target[cor_target > limit]
        Featurizer.log.debug("result============")
        Featurizer.log.debug('\n' + relevant_features.to_string())
        '''
        Wrapper Method:
        '''
        Featurizer.log.debug("Featurizer=====Using statsmodels OLS model")
        #Adding constant column of ones, mandatory for sm.OLS model
        X = df.drop(columns=[targetfeat, config.xAxisCol])  #Feature Matrix
        y = df[targetfeat]  #Target Variable
        X_1 = sm.add_constant(X)
        #Fitting sm.OLS model

        model = sm.OLS(y, X_1).fit()
        Featurizer.log.debug("result============")
        Featurizer.log.debug(model.pvalues.to_string())
        '''
        # 2.Backward Elimination
        '''
        # cols = list(X.columns)
        # pmax = 1
        # while (len(cols)>0):

        #     p= []
        #     X_1 = X[cols]
        #     X_1 = sm.add_constant(X_1)
        #     print("len(cols) :",len(cols),X_1.shape)
        #     model = sm.OLS(y,X_1).fit()
        #     p = pd.Series(model.pvalues.values[:],index = cols)
        #     pmax = max(p)
        #     feature_with_p_max = p.idxmax()
        #     if(pmax>0.05):
        #         cols.remove(feature_with_p_max)
        #         print("cols.remove(feature_with_p_max):",feature_with_p_max, ' check:',feature_with_p_max in cols)
        #     else:
        #         break
        # selected_features_BE = cols
        # Featurizer.log.debug("result============")
        # Featurizer.log.debug(selected_features_BE)
        '''
        # 2.Embedded Method
        '''
        reg = LassoCV()
        reg.fit(X, y)
        Featurizer.log.debug("Best alpha using built-in LassoCV: %f" %
                             reg.alpha_)
        Featurizer.log.debug("Best score using built-in LassoCV: %f" %
                             reg.score(X, y))
        coef = pd.Series(reg.coef_, index=X.columns)
        Featurizer.log.debug("Lasso picked " + str(sum(coef != 0)) +
                             " variables and eliminated the other " +
                             str(sum(coef == 0)) + " variables")
        imp_coef = coef.sort_values(ascending=True)
        Featurizer.log.debug('Lasso coef \n' + imp_coef.to_string())
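A usage sketch (hypothetical config values; encoderColumns lists the categorical columns to one-hot encode, and xAxisCol is the identifier column excluded from the feature matrix):

class DemoConfig:
    encoderColumns = ['TOOLG_ID']  # hypothetical categorical column
    xAxisCol = 'MFG_DATE'          # hypothetical identifier column

Featurizer.Select(df, targetfeat='WIP_QTY', config=DemoConfig(), limit=0.3)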
Example 6
File: MLBase.py Project: lokcyi/AI
 def __init__(self):
     self._config = MLConfig()
     self.log = Logger(name='MLFramework')
     self.log.debug('ML Base init..%s' % self.__class__.__name__)
Example 7
File: MLBase.py Project: lokcyi/AI
class MLBase(metaclass=abc.ABCMeta):
    ver = "MLFramework v0.01"

    def __init__(self):
        self._config = MLConfig()
        self.log = Logger(name='MLFramework')
        self.log.debug('ML Base init..%s' % self.__class__.__name__)

    @property
    def config(self):
        return self._config

    @config.setter
    def config(self, value):
        self._config = value

    # def get_next_toolg_wip(self, my_date,my_version,my_days,my_toolg=None):

    #     params_list ={'my_date':my_date,'my_version':my_version,'my_days':my_days}
    #     # query = "Exec PPM.dbo.sp_GenPredictData  %(my_date)s , %(my_version)s ,%(my_days)s "
    #     query = "Exec PPM.dbo.sp_GenPredictDataWeekly  %(my_date)s , %(my_version)s ,%(my_days)s "
    #     df = pd.read_sql(query, self.conn, params=params_list)
    #     self.conn.commit()
    #     if my_toolg is not None:
    #         df = df.loc[df['TOOLG_ID'].isin(my_toolg)]
    #         df = df.reset_index(drop=True)
    #     return df

    def getDataFromDB(self):
        """
        Fetch input data from the DB.
        """
        dataSource = self.config.dataSource[0]
        db_name = dataSource['DB']
        query = ['select * from %s ' % dataSource['TABLE']]

        if 'CONDITION' in dataSource.keys():
            if len(dataSource['CONDITION']) > 0:
                for i in range(len(dataSource['CONDITION'])):
                    if i == 0:
                        query.append('where ')
                    else:
                        query.append(' AND ')
                    if dataSource['CONDITION'][i]['operator'] == 'in':
                        query.append('  {} {}  (\'{}\')'.format(
                            dataSource['CONDITION'][i]['column'],
                            dataSource['CONDITION'][i]['operator'],
                            '\',\''.join(dataSource['CONDITION'][i]
                                         ['value'].split(','))))
                    else:
                        if dataSource['CONDITION'][i]['value'] == 'null' and (
                                dataSource['CONDITION'][i]['operator'] == '!='
                                or dataSource['CONDITION'][i]['operator']
                                == '=='):
                            query.append('  {} {}  {} '.format(
                                dataSource['CONDITION'][i]['column'],
                                dataSource['CONDITION'][i]['operator'],
                                dataSource['CONDITION'][i]['value']))
                        else:
                            query.append('  {} {}  \'{}\' '.format(
                                dataSource['CONDITION'][i]['column'],
                                dataSource['CONDITION'][i]['operator'],
                                dataSource['CONDITION'][i]['value']))
        conn = db_engine.DBEngine(db_name)
        self.dfInputData = conn.Query(' '.join(query))

        self.dfInputData.to_csv(self.config.datafile, index=False)
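    # Example dataSource entry consumed by getDataFromDB (hypothetical values):
    #   {'DB': 'PPM', 'TABLE': 'dbo.WipData',
    #    'CONDITION': [{'column': 'TOOLG_ID', 'operator': 'in',
    #                   'value': 'ETCH01,ETCH02'},
    #                  {'column': 'WIP_QTY', 'operator': '>', 'value': '0'}]}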

    def getMergeDataFile(self):
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.merge(
            self.config.dataFiles)
        self.dfInputData.to_csv(self.config.datafile, index=False)

    def getInputData(self):
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.readData(
            self.config.datafile)

    def filterData(self):
        self.dfInputData = Data.filterDataframe(self.dfInputData,
                                                self.config.InputDataCondition)
        # self.dfInputData,self.strColumnlist,self.numbericColumnlist,self.nullColumnlist=Data.readDataFrame(self.dfInputData)

    @abc.abstractmethod
    def dataTransform(self):
        return NotImplemented

    def filterColumns(self):
        self.dfInputData = Data.filterColumns(self.dfInputData, self.config)
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.analyzeData(
            self.dfInputData)
        if self.config.forceRetrain == True:
            self.dfTraining = Data.filterColumns(self.dfTraining, self.config)
        self.dfTesting = Data.filterColumns(self.dfTesting, self.config)

    # def scalerData(self):
    #     self.dfInputData=Data.scalerData(self.dfInputData,'MinMaxScaler',self.numbericColumnlist,self.config)
    #     print(self.dfInputData)

    # @abc.abstractmethod
    # def featureTransform(self):
    #     return NotImplemented

    @abc.abstractmethod
    def getTrainingData(self):
        return NotImplemented

    @abc.abstractmethod
    def getTestingData(self):
        return NotImplemented

    def genHTMLReport(self, template='template.html'):
        pd.set_option("display.precision", 3)
        htmlRender = {}
        ___acc = ''
        for i in range(len(self.config.runModel)):
            mClass = self.config.runModel[i]
            htmlRender['fitable{0}'.format(i + 1)] = (
                self.mFeatureImportances[mClass].style.render())
            if mClass != 'LSTMModel':
                htmlRender['sstable{0}'.format(i + 1)] = (
                    ModelAnalysis.sensitivityAnalysis(
                        self.model[mClass], self.mlKind[mClass],
                        self.dfInputData, self.config).style.render())
            # ___acc='{0} = {1},{2},{3}<br/>'.format(mClass,self.acc[mClass][0],self.acc[mClass][1],self.acc[mClass][2])

            if template != 'template.html':
                htmlRender['rawData{0}'.format(i + 1)] = (
                    self.showRows[mClass].style.render())

        htmlRender['accdata'] = (pd.DataFrame(
            self.acc,
            columns=list(self.acc.keys()),
            index=['acc', 'acc by axis', 'total acc']).style.render())

        htmlRender['ploimage'] = '{0}_plot.svg'.format(
            self.config.modelFileKey)

        htmlRender['nowDT'] = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        htmlRender['reportname'] = self.config.reportName
        # Template handling
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=''))
        template = env.get_template(template)
        html = template.render(htmlRender)
        #html = template.render(my_table="AAA")
        # Write the HTML file
        path = os.path.abspath('./Report/{0}_report.html'.format(
            self.config.modelFileKey))
        url = 'file://' + path
        with open(path, 'w') as f:
            f.write(html)
        webbrowser.open(url)

    def __getDATA(self):
        '''
        Read from the DB to a local file
        '''
        print(bcolors.HEADER + "===" + MLBase.ver +
              "===================================" + bcolors.ENDC)
        print(bcolors.WARNING + "===[input] Read DB==================" +
              bcolors.ENDC)
        self.log.debug("===Read DB===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'dataSource'):
            self.getDataFromDB()
        '''
        Merge input data files
        '''
        print(bcolors.HEADER + "===" + MLBase.ver +
              "===================================" + bcolors.ENDC)
        print(bcolors.WARNING + "===[input] Data merge===================" +
              bcolors.ENDC)
        self.log.debug("===Data Merge===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'dataFiles'):
            self.getMergeDataFile()
        '''
        Read CSV data (from self.config.datafile)
        '''

        print(bcolors.WARNING + "===[input] Read data===================" +
              bcolors.ENDC)
        self.log.debug("===Fetch Data===================%s" %
                       self.__class__.__name__)
        self.getInputData()
        print(bcolors.WARNING + "===[input] Filter data===================" +
              bcolors.ENDC)
        self.log.debug("===Filter Input Data===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'InputDataCondition'):
            self.filterData()

        if self.dfInputData.shape[0] == 0:
            print(bcolors.WARNING + "Row count : 0" + bcolors.ENDC)
            self.log.debug("Row count : 0   Rows {}, Columns {}".format(
                self.dfInputData.shape[0], self.dfInputData.shape[1]))
            return
        print(bcolors.WARNING + "===[input] Data transform===================" +
              bcolors.ENDC)
        self.log.debug("===Data Transform===================%s" %
                       self.__class__.__name__)
        self.dataTransform()
        '''
        Training set & testing set
        '''

        print(bcolors.WARNING +
              "===[input] Select training & testing sets================" +
              bcolors.ENDC)
        if hasattr(self.config, 'TrainCondition') and hasattr(
                self.config, 'TestCondition'):
            self.dfTraining = Data.filterDataframe(self.dfInputData,
                                                   self.config.TrainCondition)
            self.dfTesting = Data.filterDataframe(self.dfInputData,
                                                  self.config.TestCondition)
        else:
            self.dfTraining = self.getTrainingData()
            self.dfTesting = self.getTestingData()

        print(bcolors.WARNING + "===[input] Filter columns===================" +
              bcolors.ENDC)
        self.log.debug("===Data Filter===================%s" %
                       self.__class__.__name__)
        self.filterColumns()  # filter the columns of the training / testing sets

    def EDAAnalysis(self):
        self.__getDATA()
        EDA.analysis(self.dfInputData, self.config.targetCol, self.config)

    def EDACompare(self):
        self.__getDATA()
        EDA.compare(self.dfTraining, self.dfTesting, self.config.targetCol,
                    self.config)

    def FeatureSelect(self):
        self.__getDATA()
        Featurizer.Select(self.dfInputData, self.config.targetCol, self.config)

    '''
    Check that the training and testing sets contain data (if any row count is 0, stop running the models)
    '''

    def checkDFSetHasData(self):
        print(bcolors.WARNING + "Input data shape : ({},{})".format(
            self.dfInputData.shape[0], self.dfInputData.shape[1]) +
              bcolors.ENDC)
        print(bcolors.WARNING + "Training set shape : ({},{})".format(
            self.dfTraining.shape[0], self.dfTraining.shape[1]) + bcolors.ENDC)
        print(bcolors.WARNING + "Testing set shape : ({},{})".format(
            self.dfTesting.shape[0], self.dfTesting.shape[1]) + bcolors.ENDC)
        if self.dfInputData.shape[0] == 0:
            self.log.debug("Input set row count : 0")
            return False
        if self.dfTraining.shape[0] == 0:
            self.log.debug("Training set row count : 0")
            return False
        if self.dfTesting.shape[0] == 0:
            self.log.debug("Testing set row count : 0")
            return False
        return True

    def run(self):
        if hasattr(self, 'initConfigSetting'):
            if callable(self.initConfigSetting):
                self.initConfigSetting()

        self.__getDATA()
        print(bcolors.WARNING + "===GET DATA END==================" +
              bcolors.ENDC)
        print(bcolors.WARNING + "dfTraining describe----------------" +
              bcolors.ENDC)
        self.log.debug('\n' + self.dfTraining.describe().to_string())
        print(bcolors.WARNING + "dfTesting describe----------------" +
              bcolors.ENDC)
        self.log.debug('\n' + self.dfTesting.describe().to_string())
        '''
        Data preprocessing
        '''

        print(bcolors.WARNING + "===Fill missing values==================" + bcolors.ENDC)
        self.log.debug("===Fill missing values==================%s" %
                       self.__class__.__name__)
        # self.fillnull()
        self.dfTraining = Data.fillnull(self.dfTraining, self.nullColumnlist,
                                        self.config.fillNaType.value)
        self.dfTesting = Data.fillnull(self.dfTesting, self.nullColumnlist,
                                       self.config.fillNaType.value)
        self.dfOriTesting = self.dfTesting.copy(deep=True)  # deep copy so later in-place scaling cannot alter the kept originals (matches runPredict)
        self.dfTraining.to_csv("./Report/" + self.config.modelFileKey +
                               '_Training.csv')
        if not self.checkDFSetHasData():
            return

        print(bcolors.WARNING + "===Feature scaling===================" + bcolors.ENDC)
        self.log.debug("===Feature scaling===================%s" %
                       self.__class__.__name__)
        # self.scalerData()
        if not (hasattr(self.config, 'scalerKind')
                or hasattr(self.config, 'muiltiScalerKind')):
            self.config.scalerKind = scalerKind.MINMAX
        if hasattr(self.config, 'scalerKind'):
            self.dfTraining = Data.scalerData(self.dfTraining,
                                              self.config.scalerKind.value,
                                              self.numbericColumnlist,
                                              self.config,
                                              isTrain=True)
            self.dfTesting = Data.scalerData(self.dfTesting,
                                             self.config.scalerKind.value,
                                             self.numbericColumnlist,
                                             self.config,
                                             isTrain=False)
        elif hasattr(self.config, 'muiltiScalerKind'):
            self.dfTraining = Data.multiScalerData(self.dfTraining,
                                                   self.numbericColumnlist,
                                                   self.config,
                                                   isTrain=True)
            self.dfTesting = Data.multiScalerData(self.dfTesting,
                                                  self.numbericColumnlist,
                                                  self.config,
                                                  isTrain=False)

        print(bcolors.WARNING + "===Feature transform===================" + bcolors.ENDC)
        self.log.debug("===Feature transform===================%s" %
                       self.__class__.__name__)
        # self.featureTransform()
        # self.dfInputDataRaw=  self.dfTraining.copy(deep=False)

        self.dfTraining_eh = Data.featureTransform(
            self.dfTraining, self.config, True)  # exclude target_cols xAxisCol
        self.dfTraining_eh.to_csv("./log/" + self.config.modelFileKey +
                                  '_Training_eh.csv')
        self.dfTesting_eh = Data.featureTransform(
            self.dfTesting, self.config, False)  # exclude target_cols xAxisCol
        self.dfTesting_eh.to_csv("./log/" + self.config.modelFileKey +
                                 '_Testing_eh.csv')
        # self.dfTraining = self.getTrainingData()
        # if hasattr(self.config, 'TrainCondition') and hasattr(self.config, 'TestCondition'):
        #     cols = [ sub['column'] for sub in self.config.TrainCondition+self.config.TestCondition ]
        #     cols = [k for k, g in groupby(sorted(cols))]
        #     self.dfTraining= self.dfTraining.drop(columns=cols)

        self.dfInputData = self.dfTraining_eh

        print(bcolors.WARNING + "===Ready for Training===================" +
              bcolors.ENDC)
        self.log.debug("===Ready for Training===================%s" %
                       self.__class__.__name__)
        # self.dfTraining_eh= self.dfTraining_eh.drop([x for x in [self.config.xAxisCol] if x in self.dfTraining_eh.columns], axis=1)
        self.X = np.asarray(self.dfTraining_eh)
        self.y = np.asarray(self.dfTraining[self.config.targetCol])

        print(bcolors.WARNING + "===Ready for Testing===================" +
              bcolors.ENDC)
        self.log.debug("===Ready for Testing===================%s" %
                       self.__class__.__name__)
        # self.dfOriTesting = self.getTestingData()
        # self.dfTesting =  self.dfOriTesting.copy(deep=False)
        # self.dfTesting_eh = self.dfTesting_eh.drop([x for x in [self.config.xAxisCol] if x in self.dfTesting_eh.columns], axis=1)
        self.XTest = np.asarray(self.dfTesting_eh)
        '''
        Model training
        '''
        print(bcolors.OKBLUE + "===Train models====================" + bcolors.ENDC)
        self.log.debug("===Model Training===================%s" %
                       self.__class__.__name__)
        self.config._featureList = list(self.dfTraining_eh.columns)
        print(bcolors.WARNING + "_featureList : " +
              ', '.join(self.config._featureList) + bcolors.ENDC)
        self.log.debug("_featureList : {} \n".format(' , '.join(
            self.config._featureList)))
        #self.config._featureList=list(self.dfTraining.drop(self.config.targetCol, axis=1).columns)
        self.model = {}
        self.mlKind = {}
        self.mFeatureImportances = {}
        self.acc = {}
        for i in range(len(self.config.runModel)):
            mClass = self.config.runModel[i]
            mObj = getattr(globals()[mClass], mClass)()
            if mClass == 'LSTMModel':
                self.X = np.reshape(self.X,
                                    (self.X.shape[0], self.X.shape[1], 1))

            for attempt in range(1, 5):
                self.model[mClass], self.mlKind[
                    mClass], self.mFeatureImportances[
                        mClass] = mObj.doTraining(self.X, self.y, self.config)
                _validation_config = copy.deepcopy(self.config)
                _validation_config.modelFileKey = _validation_config.modelFileKey + "_Val"
                _acc, _, _, _ = Data.testModel(self.X, self.model[mClass],
                                               self.mlKind[mClass],
                                               self.dfTraining,
                                               _validation_config)
                if _acc > 40:
                    self.log.debug(
                        "doTraining acc > 40 ---- attempt No.{} => {} PASS.".format(
                            attempt, _acc))
                    break

        print(bcolors.OKBLUE + "===Test models====================" + bcolors.ENDC)
        self.log.debug("===Model Testing===================%s" %
                       self.__class__.__name__)
        '''
        Model testing
        '''
        plt.style.use('ggplot')
        plt.figure(figsize=(20, 6 * len(self.config.runModel)), dpi=60)
        for i in range(len(self.config.runModel)):
            plt.subplot(len(self.config.runModel) * 100 + 10 + 1 + i)
            mClass = self.config.runModel[i]
            if mClass == 'LSTMModel':
                self.XTest = np.reshape(
                    self.XTest, (self.XTest.shape[0], self.XTest.shape[1], 1))
            _acc, _accsum, _total_acc, _ = Data.testModel(
                self.XTest, self.model[mClass], self.mlKind[mClass],
                self.dfOriTesting, self.config)
            self.acc[mClass] = [_acc, _accsum, _total_acc]
        plt.tight_layout()
        plt.savefig('./Report/{0}_plot.svg'.format(self.config.modelFileKey))
        '''
        Report generation
        '''
        print(bcolors.OKBLUE + "===Generate report====================" + bcolors.ENDC)
        self.log.debug("===Create Report===================%s" %
                       self.__class__.__name__)
        self.genHTMLReport()

    def runPredict(self):
        if hasattr(self, 'initConfigSetting'):
            if callable(self.initConfigSetting):
                self.initConfigSetting()
        self.__getDATA()
        '''
        Data preprocessing
        '''
        print(bcolors.WARNING + "===Fill missing values==================" + bcolors.ENDC)
        self.log.debug("===Fill missing values==================%s" %
                       self.__class__.__name__)
        # self.fillnull()
        # self.dfTraining = Data.fillnull(self.dfTraining, self.nullColumnlist, self.config.fillNaType.value)
        self.dfTesting = Data.fillnull(self.dfTesting, self.nullColumnlist,
                                       self.config.fillNaType.value)
        self.dfOriTesting = self.dfTesting.copy(deep=True)

        print(bcolors.WARNING + "===Feature scaling===================" + bcolors.ENDC)
        self.log.debug("===Feature scaling===================%s" %
                       self.__class__.__name__)

        if not hasattr(self.config, 'scalerKind'):
            self.config.scalerKind = scalerKind.MINMAX
        # self.dfTraining = Data.scalerData(self.dfTraining, self.config.scalerKind.value,self.numbericColumnlist,self.config, isTrain=True)
        self.dfTesting = Data.scalerData(self.dfTesting,
                                         self.config.scalerKind.value,
                                         self.numbericColumnlist,
                                         self.config,
                                         isTrain=False)

        print(bcolors.WARNING + "===Feature transform===================" + bcolors.ENDC)
        self.log.debug("===Feature transform===================%s" %
                       self.__class__.__name__)
        # self.featureTransform()
        # self.dfInputDataRaw=  self.dfTraining.copy(deep=False)
        # self.dfTraining_eh = Data.featureTransform(self.dfTraining, self.config,True)  # exclude target_cols xAxisCol
        # self.dfTraining_eh.to_csv("./log/"+self.config.modelFileKey+'_Training.csv')
        self.dfTesting_eh = Data.featureTransform(
            self.dfTesting, self.config, False)  # exclude target_cols xAxisCol
        self.dfTesting_eh.to_csv("./report/" + self.config.modelFileKey +
                                 '_Testing.csv')

        print(bcolors.WARNING + "===Ready for Testing===================" +
              bcolors.ENDC)
        self.log.debug("===Ready for Testing===================%s" %
                       self.__class__.__name__)
        self.XTest = np.asarray(self.dfTesting_eh)
        self.dfInputData = self.dfTesting_eh
        '''
        Model training
        '''
        print(bcolors.OKBLUE + "===Train models====================" + bcolors.ENDC)
        self.log.debug("===Model Training===================%s" %
                       self.__class__.__name__)
        self.config._featureList = list(self.dfTesting_eh.columns)
        print(bcolors.WARNING + "_featureList : " +
              ', '.join(self.config._featureList) + bcolors.ENDC)
        self.log.debug("_featureList : {} \n".format(' , '.join(
            self.config._featureList)))
        #self.config._featureList=list(self.dfTraining.drop(self.config.targetCol, axis=1).columns)
        self.model = {}
        self.mlKind = {}
        self.mFeatureImportances = {}
        self.acc = {}
        self.showRows = {}
        # for i in range(len(self.config.runModel)):
        #     mClass=self.config.runModel[i]
        #     mObj = getattr(globals()[mClass], mClass)()
        #     if mClass =='LSTMModel':
        #         self.X = np.reshape(self.X, (self.X.shape[0], self.X.shape[1], 1))
        #     self.model[mClass], self.mlKind[mClass], self.mFeatureImportances[mClass] = mObj.doTraining(self.X, self.y, self.config)

        # print(bcolors.OKBLUE + "===Test models====================" + bcolors.ENDC)
        self.log.debug("===Model Testing===================%s" %
                       self.__class__.__name__)
        '''
        Model testing
        '''
        plt.style.use('ggplot')
        plt.figure(figsize=(20, 6 * len(self.config.runModel)), dpi=60)
        for i in range(len(self.config.runModel)):
            plt.subplot(len(self.config.runModel) * 100 + 10 + 1 + i)
            mClass = self.config.runModel[i]
            mObj = getattr(globals()[mClass], mClass)()
            if mClass == 'LSTMModel':
                self.XTest = np.reshape(
                    self.XTest, (self.XTest.shape[0], self.XTest.shape[1], 1))
            self.model[mClass], self.mlKind[mClass], self.mFeatureImportances[
                mClass] = mObj.loadTestingModel(self.config)
            _acc, _accsum, _total_acc, _showRow = Data.testModel(
                self.XTest, self.model[mClass], self.mlKind[mClass],
                self.dfOriTesting, self.config)
            self.acc[mClass] = [_acc, _accsum, _total_acc]
            _showRow.insert(loc=0, column='model', value=mClass)
            self.showRows[mClass] = _showRow
        plt.tight_layout()
        plt.savefig('./Report/{0}_plot.svg'.format(self.config.modelFileKey))
        '''
        Report generation
        '''
        print(bcolors.OKBLUE + "===Generate report====================" + bcolors.ENDC)
        self.log.debug("===Create Report===================%s" %
                       self.__class__.__name__)
        self.genHTMLReport('template2.html')
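A runnable driver subclasses MLBase and fills in the abstract hooks; a hypothetical sketch (the column name and split date are assumptions):

class DemoPipeline(MLBase):
    def dataTransform(self):
        pass  # no extra transformation in this demo

    def getTrainingData(self):
        return self.dfInputData[self.dfInputData['MFG_DATE'] < '2021-01-01']

    def getTestingData(self):
        return self.dfInputData[self.dfInputData['MFG_DATE'] >= '2021-01-01']

pipeline = DemoPipeline()
pipeline.run()  # or pipeline.runPredict() to score with previously saved models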
Example 8
class MLBase(metaclass=abc.ABCMeta):
    ver = "MLFramework v0.01"

    def __init__(self):
        self._config = MLConfig()
        self.log = Logger(name='MLFramework')
        self.log.debug('ML Base init..%s' % self.__class__.__name__)

    @property
    def config(self):
        return self._config

    @config.setter
    def config(self, value):
        self._config = value

    def getMergeDataFile(self):
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.merge(
            self.config.dataFiles)
        self.dfInputData.to_csv(self.config.datafile, index=False)

    def getInputData(self):
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.readData(
            self.config.datafile)

    def filterData(self):
        self.dfInputData = Data.filterDataframe(self.dfInputData,
                                                self.config.InputDataCondition)
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.readDataFrame(
            self.dfInputData)

    @abc.abstractmethod
    def dataTransform(self):
        return NotImplemented

    def filterColumns(self):
        self.dfInputData = Data.filterColumns(self.dfInputData, self.config)
        self.dfInputData, self.strColumnlist, self.numbericColumnlist, self.nullColumnlist = Data.analyzeData(
            self.dfInputData)

    def fillnull(self):
        if hasattr(self.config, 'fillNaType'):
            if (self.config.fillNaType.value == 'mean'):
                self.dfInputData[self.nullColumnlist] = self.dfInputData[
                    self.nullColumnlist].fillna(
                        self.dfInputData.median()).fillna(value=0)
            elif (self.config.fillNaType.value == 'mode'):
                self.dfInputData = self.dfInputData.fillna(
                    self.dfInputData.mode())
            elif (self.config.fillNaType.value == 'bfill'):
                self.dfInputData = self.dfInputData.fillna(
                    method='bfill').fillna(self.dfInputData.median())
            elif (self.config.fillNaType.value == 'ffill'):
                self.dfInputData = self.dfInputData.fillna(
                    method='ffill').fillna(self.dfInputData.median())
            elif (self.config.fillNaType.value == 'dropna'):
                self.dfInputData = self.dfInputData.dropna()
            elif (self.config.fillNaType.value == 'zero'):
                self.dfInputData[self.nullColumnlist] = self.dfInputData[
                    self.nullColumnlist].fillna(0)
        else:
            self.dfInputData[self.nullColumnlist] = self.dfInputData[
                self.nullColumnlist].fillna(
                    self.dfInputData.median()).fillna(value=0)

    def scalerData(self):
        self.dfInputData = Data.scalerData(self.dfInputData, 'MinMaxScaler',
                                           self.numbericColumnlist,
                                           self.config)
        print(self.dfInputData)

    @abc.abstractmethod
    def featureTransform(self):
        return NotImplemented

    @abc.abstractmethod
    def getTrainingData(self):
        return NotImplemented

    @abc.abstractmethod
    def getTestingData(self):
        return NotImplemented

    def genHTMLReport(self):
        pd.set_option("display.precision", 3)
        htmlRender = {}
        for i in range(len(self.config.runModel)):
            mClass = self.config.runModel[i]
            htmlRender['fitable{0}'.format(i + 1)] = (
                self.mFeatureImportances[mClass].style.render())
            if mClass != 'LSTMModel':
                htmlRender['sstable{0}'.format(i + 1)] = (
                    ModelAnalysis.sensitivityAnalysis(
                        self.model[mClass], self.mlKind[mClass],
                        self.dfInputData, self.config).style.render())
        htmlRender['ploimage'] = '{0}_plot.svg'.format(
            self.config.modelFileKey)
        # Template handling
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=''))
        template = env.get_template('template.html')
        html = template.render(htmlRender)
        #html = template.render(my_table="AAA")
        # Write the HTML file
        path = os.path.abspath('./Report/{0}_report.html'.format(
            self.config.modelFileKey))
        url = 'file://' + path
        with open(path, 'w') as f:
            f.write(html)
        webbrowser.open(url)

    def run(self):
        print(bcolors.HEADER + "===" + MLBase.ver +
              "===================================" + bcolors.ENDC)
        print(bcolors.WARNING + "===Data merge===================" + bcolors.ENDC)
        self.log.debug("===Data Merge===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'dataFiles'):
            self.getMergeDataFile()
        print(bcolors.WARNING + "===Read data===================" + bcolors.ENDC)
        self.log.debug("===Fetch Data===================%s" %
                       self.__class__.__name__)
        self.getInputData()

        print(bcolors.WARNING + "===Filter data===================" + bcolors.ENDC)
        self.log.debug("===Filter Input Data===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'InputDataCondition'):
            self.filterData()

        print(bcolors.WARNING + "===Data transform===================" + bcolors.ENDC)
        self.log.debug("===Data Transform===================%s" %
                       self.__class__.__name__)
        self.dataTransform()

        print(bcolors.WARNING + "===Filter columns===================" + bcolors.ENDC)
        self.log.debug("===Data Filter===================%s" %
                       self.__class__.__name__)
        self.filterColumns()

        print(bcolors.WARNING + "===Fill missing values==================" + bcolors.ENDC)
        self.log.debug("===fill None==================%s" %
                       self.__class__.__name__)
        self.fillnull()
        self.dfInputData.info()
        print(bcolors.WARNING + "===Feature scaling===================" + bcolors.ENDC)
        self.log.debug("===scale===================%s" %
                       self.__class__.__name__)
        self.scalerData()
        print(bcolors.WARNING + "===Feature transform===================" + bcolors.ENDC)
        self.log.debug("===feature Transform===================%s" %
                       self.__class__.__name__)
        self.featureTransform()
        self.dfInputData.info()
        print(bcolors.WARNING + "===Prepare training data================" + bcolors.ENDC)
        self.log.debug("===Ready for Training===================%s" %
                       self.__class__.__name__)
        self.dfTraining = self.getTrainingData()
        # if hasattr(self.config, 'TrainCondition') and hasattr(self.config, 'TestCondition'):
        #     cols = [ sub['column'] for sub in self.config.TrainCondition+self.config.TestCondition ]
        #     cols = [k for k, g in groupby(sorted(cols))]
        #     self.dfTraining= self.dfTraining.drop(columns=cols)

        self.dfTraining = self.dfTraining.drop([
            x for x in [self.config.xAxisCol] if x in self.dfTraining.columns
        ],
                                               axis=1)
        self.X = np.asarray(self.dfTraining.drop(self.config.targetCol,
                                                 axis=1))
        self.y = np.asarray(self.dfTraining[self.config.targetCol])

        print(bcolors.WARNING + "===Prepare testing data================" + bcolors.ENDC)
        self.log.debug("===Ready for Testing===================%s" %
                       self.__class__.__name__)
        self.dfOriTesting = self.getTestingData()
        self.dfTesting = self.dfOriTesting.copy(deep=False)
        self.dfTesting = self.dfTesting.drop(
            [x for x in [self.config.xAxisCol] if x in self.dfTesting.columns],
            axis=1)
        self.XTest = np.asarray(
            self.dfTesting.drop(self.config.targetCol, axis=1))

        print(bcolors.OKBLUE + "===Train models====================" + bcolors.ENDC)
        self.log.debug("===Model Training===================%s" %
                       self.__class__.__name__)
        self.config._featureList = list(
            self.dfTraining.drop(self.config.targetCol, axis=1).columns)
        self.model = {}
        self.mlKind = {}
        self.mFeatureImportances = {}
        for i in range(len(self.config.runModel)):
            mClass = self.config.runModel[i]
            mObj = getattr(globals()[mClass], mClass)()
            if mClass == 'LSTMModel':
                self.X = np.reshape(self.X,
                                    (self.X.shape[0], self.X.shape[1], 1))
            self.model[mClass], self.mlKind[mClass], self.mFeatureImportances[
                mClass] = mObj.doTraining(self.X, self.y, self.config)

        print(bcolors.OKBLUE + "===Test models====================" + bcolors.ENDC)
        self.log.debug("===Model Testing===================%s" %
                       self.__class__.__name__)

        plt.style.use('ggplot')
        plt.figure(figsize=(20, 6 * len(self.config.runModel)), dpi=60)
        for i in range(len(self.config.runModel)):
            plt.subplot(len(self.config.runModel) * 100 + 10 + 1 + i)
            mClass = self.config.runModel[i]
            if mClass == 'LSTMModel':
                self.XTest = np.reshape(
                    self.XTest, (self.XTest.shape[0], self.XTest.shape[1], 1))
            Data.testModel(self.XTest, self.model[mClass], self.mlKind[mClass],
                           self.dfOriTesting, self.config)

        plt.tight_layout()
        plt.savefig('./Report/{0}_plot.svg'.format(self.config.modelFileKey))
        print(bcolors.OKBLUE + "===Generate report====================" + bcolors.ENDC)
        self.log.debug("===Create Report===================%s" %
                       self.__class__.__name__)
        self.genHTMLReport()
Example 9
class Data:
    log = Logger(name='MLFramework')

    @staticmethod
    def readData(inputfile):
        Data.log.debug('readData ==> %s' % inputfile)
        df = pd.read_csv(inputfile)
        df = df.dropna(axis=1, how='all')
        df.info()
        return Data.analyzeData(df)

    @staticmethod
    def readDataFrame(df):
        df = df.dropna(axis=1, how='all')
        return Data.analyzeData(df)

    @staticmethod
    def merge(dataFiles):
        index = 0
        df_merge = None
        for dfFile in dataFiles['files']:
            print(dfFile)
            if index == 0:
                _dfInputData1, _strColumnlist1, _numbericColumnlist1, _nullColumnlist1 = Data.readData(
                    dfFile)
                df_merge = _dfInputData1  # a single file needs no merge
            else:
                datasetRels = dataFiles['relations'][index - 1]
                _dfInputData2, _strColumnlist2, _numbericColumnlist2, _nullColumnlist2 = Data.readData(
                    dfFile)
                df_merge = Data.mergeDataFrame(_dfInputData1, _dfInputData2,
                                               datasetRels[0], datasetRels[1])
                _dfInputData1 = df_merge.copy(deep=False)
            index += 1

        return Data.analyzeData(df_merge)

    @staticmethod
    def mergeDataFrame(dfleft, dfright, LeftKeys, RightKeys):
        # dfright.columns = [str(col) + '_'+joinTableName for col in df.columns]
        df_merge = pd.merge(dfleft,
                            dfright,
                            left_on=LeftKeys,
                            right_on=RightKeys,
                            how="inner")
        return df_merge

    @staticmethod
    def analyzeData(df):

        print('Non-numeric columns:')
        strColumnlist = df.select_dtypes(
            exclude=['int64', 'float64']).columns.tolist()
        print(strColumnlist)
        print('Numeric columns:')
        numbericColumnlist = df.select_dtypes(
            include=['int64', 'float64']).columns.tolist()
        print(numbericColumnlist)
        print('Columns containing NULL:')
        nullColumnlist = df.columns[df.isna().any()].tolist()
        print(nullColumnlist)
        print('NULL counts per column:')
        print(
            pd.DataFrame({
                'COUNT':
                df.isnull().sum(),
                'Missing Ratio': (df.isnull().sum() * 100 / df.shape[0])
            }))

        Data.log.debug('\n' + pd.DataFrame({
            'COUNT':
            df.isnull().sum(),
            'Missing Ratio': (df.isnull().sum() * 100 / df.shape[0])
        }).to_string())
        Data.log.debug('\n' + df.describe().to_string())
        print('===================================================')
        return df, strColumnlist, numbericColumnlist, nullColumnlist

    @staticmethod
    def filterDataframe(df, condition):
        for c in condition:
            if c['operator'] == "=":
                df = df[df[c['column']] == c['value']]
            elif c['operator'] == "=!" or c['operator'] == "!=":
                df = df[df[c['column']] != c['value']]
            elif c['operator'] == "<=":
                df = df[df[c['column']] <= c['value']]
            elif c['operator'] == "<":
                df = df[df[c['column']] < c['value']]
            elif c['operator'] == ">=":
                df = df[df[c['column']] >= c['value']]
            elif c['operator'] == ">":
                df = df[df[c['column']] > c['value']]
        return df
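    # Example condition list consumed by filterDataframe (hypothetical values):
    #   [{'column': 'TOOLG_ID', 'operator': '=', 'value': 'ETCH01'},
    #    {'column': 'WIP_QTY', 'operator': '>=', 'value': 100}]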

    @staticmethod
    def fillnull(df, nullColumnlist, fillType):
        if (fillType == 'mean'):
            # df[nullColumnlist] = df[nullColumnlist].fillna(df.median()).fillna(value=0)
            df = df.fillna(df.median()).fillna(value=0)  # note: fills with column medians, then 0
        elif (fillType == 'mode'):
            df = df.fillna(df.mode())
        elif (fillType == 'bfill'):
            df = df.fillna(method='bfill').fillna(df.median())
        elif (fillType == 'ffill'):
            df = df.fillna(method='ffill').fillna(df.median())
        elif (fillType == 'dropna'):
            df = df.dropna()
        elif (fillType == 'zero'):
            df = df.fillna(0)
        return df

    @staticmethod
    def filterColumns(df, config):
        try:
            includeColumns = config.includeColumns
            excludeColumns = config.excludeColumns
            if (len(includeColumns) > 0):
                df = df[includeColumns]
            if (len(excludeColumns) > 0):
                df = df.drop(columns=excludeColumns)
            return df
        except:
            Data.log.error("filterColumns error: {}".format(sys.exc_info()[0]))
            #Data.log.error(error)
            raise  # just this!
            # raise AppError      # Don't do this, you'll lose the stack trace!

    @staticmethod
    def scalerData(df, scalerKind, numbericColumnlist, config, isTrain=True):
        if len(numbericColumnlist) > 0:
            target_cols = config.targetCol
            scalerColumnlist = [
                ele for ele in numbericColumnlist if ele not in target_cols
            ]
            scalerColumnlist = list(
                set(scalerColumnlist).intersection(df.columns))
            if isTrain:
                scaler = None
                if (scalerKind == 'standard'):
                    scaler = StandardScaler()
                elif (scalerKind == 'minmax'):
                    scaler = MinMaxScaler()
                elif (scalerKind == 'MinMaxScaler'):
                    scaler = MinMaxScaler()
                elif (scalerKind == 'robust'):
                    scaler = RobustScaler()
                elif (scalerKind == 'maxabs'):
                    scaler = MaxAbsScaler()
                elif (scalerKind == 'normal'):
                    scaler = Normalizer()
                else:
                    scaler = MinMaxScaler()
                scaler.fit(df[scalerColumnlist])
                df[scalerColumnlist] = scaler.transform(df[scalerColumnlist])
                dump(
                    scaler,
                    open('model/scaler_{}.pkl'.format(config.modelFileKey),
                         'wb'))
            else:
                scaler = load(
                    open('model/scaler_{}.pkl'.format(config.modelFileKey),
                         'rb'))
                df[scalerColumnlist] = scaler.transform(df[scalerColumnlist])
        return df

    @staticmethod
    def featureTransform(df, config, isTrain=True):
        if len(config.encoderColumns) > 0:
            target_cols = config.targetCol

            df = pd.get_dummies(df.drop([target_cols, config.xAxisCol],
                                        axis=1),
                                columns=config.encoderColumns,
                                prefix_sep='_')
            if isTrain:
                df.head(0).to_csv('model/eh_{}.csv'.format(
                    config.modelFileKey),
                                  index=0)  # save the header only, without the row index
            else:
                df_eh = pd.read_csv('model/eh_{}.csv'.format(
                    config.modelFileKey))  # columns captured at training time
                df = df.reindex(columns=df_eh.columns, fill_value=0)
                # Ensure the order of column in the test set is in the same order than in train set
                df = df[df_eh.columns]

        return df

    @staticmethod
    def accsum(def_result, target_cols):
        _accsum = 0
        for index, row in def_result.iterrows():
            # guard against division by zero when the actual value is 0
            if row[target_cols] == 0 and row['Predict'] == 0:
                row[target_cols] = 1
                row['Predict'] = 1
            elif row[target_cols] == 0 and row['Predict'] != 0:
                row[target_cols] = 0.00001

            if row[target_cols] < 0:
                row[target_cols] = 0.00001

            if row['Predict'] < 0:
                row['Predict'] = 0

            if 1 - abs(
                (row['Predict'] - row[target_cols]) / row[target_cols]) > 0:
                _accsum += (1 - abs(
                    (row['Predict'] - row[target_cols]) / row[target_cols]))

        return round(_accsum * 100 / def_result.shape[0], 2)

    @staticmethod
    def testModel(XTest, model, mlKind, dfOri, config):
        yTest = model.predict(XTest)
        df2 = dfOri.copy(deep=False)
        df2.insert(len(df2.columns), 'Predict', yTest)
        plt.title((mlKind + ":{0}%").format(Data.accsum(df2,
                                                        config.targetCol)))
        plt.xlabel(config.xAxisCol)
        plt.xticks(rotation=90)
        plt.ylabel(config.targetCol)
        df2 = df2.sort_values(config.xAxisCol, ascending=True)
        t = df2[config.xAxisCol].to_numpy() + '_' + np.arange(
            len(XTest)).astype(str)  # build unique x-axis tick labels
        plt.plot(t,
                 df2['Predict'],
                 label=mlKind,
                 color='red',
                 marker='.',
                 linewidth='0.5')
        plt.plot(t,
                 df2[config.targetCol],
                 label="ACT",
                 color='blue',
                 marker='x',
                 linewidth='0')
        plt.legend()
        plt.ylim(bottom=0)
        df2.to_csv('./Report/' + config.modelFileKey + '_' + mlKind + '.csv',
                   index=False)

        _acc = mlKind, Data.accsum(df2, config.targetCol)
        print(mlKind + '  ' + config.modelFileKey +
              " Test acc : %.2f" % _acc[1])
        Data.log.debug(mlKind + '  ' + config.modelFileKey +
                       " Test acc: %.2f" % _acc[1])

        _accsum = 0
        def_result_summary = df2.groupby(config.xAxisCol,
                                         as_index=False).sum().reset_index()[[
                                             config.xAxisCol, config.targetCol,
                                             'Predict'
                                         ]]
        if (def_result_summary.shape[0] > 1):
            _acc = mlKind, Data.accsum(def_result_summary, config.targetCol)
            Data.log.debug(mlKind + '  ' + config.modelFileKey +
                           " Test group by x-axis acc: %.2f" % _acc[1])
            print(mlKind + " Test group by x-axis acc:  %.2f" % _acc[1])

        def_result_summary = df2[[config.targetCol, 'Predict']].sum()
        if (def_result_summary[config.targetCol] != 0):
            total_acc = (1 - abs(def_result_summary['Predict'] -
                                 def_result_summary[config.targetCol]) /
                         def_result_summary[config.targetCol]) * 100
            print(mlKind + " Test aggregation acc : %.2f " % total_acc)
            Data.log.debug(mlKind + '  ' + config.modelFileKey +
                           " Test aggregation acc : %.2f " % total_acc)
            dfraw = pd.read_csv(config.datafile)
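To illustrate the accsum metric above: each row contributes max(0, 1 - |Predict - actual| / actual), and the mean is reported as a percentage. A tiny check with hypothetical data:

import pandas as pd

df = pd.DataFrame({'QTY': [100, 50], 'Predict': [90, 75]})
# row 1: 1 - |90 - 100| / 100 = 0.90; row 2: 1 - |75 - 50| / 50 = 0.50
print(Data.accsum(df, 'QTY'))  # -> round((0.90 + 0.50) * 100 / 2, 2) == 70.0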