class EDA:
    """Exploratory-data-analysis helpers that write HTML reports via ``sv``.

    NOTE(review): ``sv`` is presumably ``sweetviz`` imported at file level;
    the import is outside this view.
    """

    log = Logger(name='MLFramework')

    @staticmethod
    def analysis(df, targetfeat, config):
        """Analyze a single dataframe and save the report as HTML.

        Args:
            df: Dataframe to analyze (labelled 'train' in the report).
            targetfeat: Name of the target feature passed as ``target_feat``.
            config: Object whose ``modelFileKey`` is used in the file name.
        """
        # Correlations and other pairwise associations can take a long time
        # to compute; past some threshold this flag has to be set explicitly
        # to 'on' or 'off' to decide whether they are analyzed at all.
        pairwise_analysis = 'on'
        report_train = sv.analyze([df, 'train'],
                                  target_feat=targetfeat,
                                  pairwise_analysis=pairwise_analysis)
        # Save in HTML format. (Fix: the original called the misspelled
        # str method ``fomrmat`` and raised AttributeError here.)
        report_train.show_html(
            filepath='./report/{}_AnalysisReport.html'.format(
                config.modelFileKey))

    @staticmethod
    def compare(df_train, df_test, targetfeat, config):
        """Compare a training and a testing dataframe and save the report.

        Args:
            df_train: Dataframe labelled 'Train' in the report.
            df_test: Dataframe labelled 'Test' in the report.
            targetfeat: Name of the target feature passed as ``target_feat``.
            config: Object whose ``modelFileKey`` is used in the file name.
        """
        # See analysis(): pairwise association analysis may be slow.
        pairwise_analysis = 'on'
        compare_subsets_report = sv.compare(
            [df_train, 'Train'],
            [df_test, 'Test'],
            target_feat=targetfeat,
            pairwise_analysis=pairwise_analysis)
        compare_subsets_report.show_html(
            filepath='./report/{}_CompareReport.html'.format(
                config.modelFileKey))
def toggle_debug_mode(self, enabled):
    """Record the debug flag and, when enabling, start a fresh debug log.

    When ``enabled`` is truthy a time-stamped folder is created under
    ``DEBUG_FOLDER_ROOT``, an empty ``log.txt`` is created inside it, and
    ``sys.stdout`` is replaced by a ``Logger`` bound to that file.

    Args:
        enabled: Truthy to turn debug mode on; falsy only stores the flag.
    """
    self.debug_mode = enabled
    if not enabled:
        # NOTE(review): disabling does not restore the previous sys.stdout;
        # that matches the original behavior and is left unchanged.
        return

    self.debug_folder = DEBUG_FOLDER_ROOT + '/' + str(time.time()) + '/'
    if not os.path.exists(self.debug_folder):
        # makedirs (not mkdir) so a missing DEBUG_FOLDER_ROOT is created too
        # instead of raising FileNotFoundError.
        os.makedirs(self.debug_folder, exist_ok=True)
        print("New folder created: " + self.debug_folder)

    self.debug_log_file = self.debug_folder + 'log.txt'
    # Create/truncate the log file; the context manager guarantees the
    # handle is closed even if an error occurs.
    with open(self.debug_log_file, 'w'):
        pass

    self.logger = Logger(self.debug_log_file)
    sys.stdout = self.logger
def __init__(self):
    """Attach the 'MLFramework' logger and log the concrete class name."""
    self.log = Logger(name='MLFramework')
    concrete_name = self.__class__.__name__
    self.log.debug('ML Model Base init..%s' % concrete_name)
class MLModelBase(metaclass=abc.ABCMeta):
    """Abstract base for trainable models with on-disk model caching.

    Subclasses implement ``training(X, y)``. Models are stored under
    ``./model/`` as ``<ClassName>_<modelFileKey>.h5`` (objects exposing a
    ``save`` method) or ``<ClassName>_<modelFileKey>.model`` (joblib dump).
    """

    def __init__(self):
        self.log = Logger(name='MLFramework')
        self.log.debug('ML Model Base init..%s' % self.__class__.__name__)

    def _modelPaths(self, config):
        """Return ``(modelname, h5File, modelFile)`` for this class/config."""
        modelname = self.__class__.__name__
        h5File = "./model/" + modelname + "_{0}.h5".format(config.modelFileKey)
        modelFile = "./model/" + modelname + "_{0}.model".format(
            config.modelFileKey)
        return modelname, h5File, modelFile

    @staticmethod
    def _featureImportances(model, feature_list, modelname):
        """Build the per-feature importance table for ``model``.

        Uses ``model.feature_importances_`` when present, otherwise the
        absolute value of ``model.coef_``; returns an empty DataFrame when the
        model exposes neither. Values are rounded to 3 decimals and rows are
        sorted by importance, largest first.

        Returns:
            DataFrame with columns ``'Variable'`` and ``'重要性_' + modelname``.
        """
        column = '重要性_' + modelname
        from_coef = False
        if hasattr(model, 'feature_importances_'):
            scores = [(feature, np.round(importance, 3))
                      for feature, importance in zip(
                          feature_list, list(model.feature_importances_))]
        elif hasattr(model, 'coef_'):
            from_coef = True
            scores = [(feature, abs(np.round(importance, 3)))
                      for feature, importance in zip(feature_list,
                                                     list(model.coef_))]
        else:
            return DataFrame()

        # Most important feature first.
        scores = sorted(scores, key=lambda pair: pair[1], reverse=True)
        frame = DataFrame(scores)
        frame.columns = ['Variable', column]
        if from_coef:
            # Coefficients are scaled by the column's standard deviation and
            # then normalised so the column sums to 1.
            frame[column] = frame[column] * frame[column].std()
            frame[column] = frame[column] / frame[column].sum()
        return frame

    def doTraining(self, X, y, config):
        """Return a trained model, reusing a saved one unless retraining.

        A saved ``.h5`` file is preferred, then a saved ``.model`` file; both
        are only used when ``config.forceRetrain == False``. Otherwise
        ``self.training(X, y)`` is called, the training-set R2 is logged and
        the model is saved.

        Args:
            X: Training features passed to ``training`` / ``predict``.
            y: Training targets.
            config: Provides ``modelFileKey``, ``forceRetrain`` and
                ``_featureList``.

        Returns:
            Tuple ``(model, modelname, df_feature_importances)``.
        """
        self.log.debug('%s doTraining ' % self.__class__.__name__)
        modelname, h5File, modelFile = self._modelPaths(config)
        feature_list = config._featureList

        # The explicit ``== False`` comparison is kept on purpose so values
        # such as None still force a retrain exactly as before.
        reuse_saved = config.forceRetrain == False  # noqa: E712

        if os.path.isfile(h5File) and reuse_saved:
            print("training " + modelname + " :load model from file " + h5File)
            model = tf.keras.models.load_model(h5File)
        elif os.path.isfile(modelFile) and reuse_saved:
            print("training " + modelname + " :load model from file " +
                  modelFile)
            self.log.debug("training " + modelname + ":load model from file " +
                           modelFile)
            # NOTE(review): joblib.load unpickles the file; only load model
            # files from a trusted location.
            model = joblib.load(modelFile)
        else:
            # NOTE(review): the source file's indentation was lost; the
            # logging/evaluation/saving steps below are assumed to belong to
            # the fresh-training branch only. Confirm against the original.
            print("training " + modelname + " :training model...")
            self.log.debug("training " + modelname + ":training model...")
            model = self.training(X, y)
            if hasattr(model, 'history'):
                history = model.history
                self.log.debug('{}-{}-history epochs:{}'.format(
                    modelname, config.modelFileKey,
                    len(history.history['loss'])))
                model.summary(print_fn=lambda x: self.log.debug(x))
            if hasattr(model, 'clf'):
                clf = model.clf
                self.log.debug(
                    f"最佳準確率: {clf.best_score_},最佳參數組合:{clf.best_params_}")
            predicted = model.predict(X)
            r2 = metrics.r2_score(y, predicted)
            self.log.debug('{}-{}-模型績效..R2:{}'.format(
                modelname, config.modelFileKey, r2))
            if hasattr(model, 'save'):
                model.save(h5File)
            else:
                joblib.dump(model, modelFile)

        df_feature_importances = self._featureImportances(
            model, feature_list, modelname)
        return model, modelname, df_feature_importances

    def loadTestingModel(self, config):
        """Load the saved model for this class and build its importances.

        Both candidate files are checked; when both exist the ``.model`` file
        loaded last wins, as in the original code.

        Args:
            config: Provides ``modelFileKey`` and ``_featureList``.

        Returns:
            Tuple ``(model, modelname, df_feature_importances)``.

        Raises:
            FileNotFoundError: If neither saved model file exists (the
                original failed with an UnboundLocalError in that case).
        """
        self.log.debug('%s doTesting' % self.__class__.__name__)
        modelname, h5File, modelFile = self._modelPaths(config)
        feature_list = config._featureList

        model = None
        if os.path.isfile(h5File):
            print("training " + modelname + ":load model from file " + h5File)
            model = tf.keras.models.load_model(h5File)
        if os.path.isfile(modelFile):
            print("training " + modelname + ":load model from file " +
                  modelFile)
            model = joblib.load(modelFile)
        if model is None:
            raise FileNotFoundError(
                "No saved model for {}: neither {} nor {} exists".format(
                    modelname, h5File, modelFile))

        df_feature_importances = self._featureImportances(
            model, feature_list, modelname)
        return model, modelname, df_feature_importances

    @abc.abstractmethod
    def training(self, X, y):
        """Train and return a model for ``X``/``y``; implemented by subclasses."""
        return NotImplemented
class Featurizer:
    """Feature-selection diagnostics; results are written to the log only."""

    log = Logger(name='MLFramework')

    @staticmethod
    def Select(df, targetfeat, config, limit=0.3):
        """Log three feature-relevance views for ``targetfeat``.

        1. Filter method: absolute Pearson correlation with the target,
           keeping features above ``limit``.
        2. Wrapper method: p-values of a statsmodels OLS fit.
        3. Embedded method: coefficients of a ``LassoCV`` fit.

        Args:
            df: Input dataframe containing the target and feature columns.
            targetfeat: Name of the target column.
            config: Provides ``encoderColumns`` (columns to one-hot encode)
                and ``xAxisCol`` (column(s) excluded from the feature matrix).
            limit: Minimum absolute correlation to count as relevant.

        Returns:
            None. All results go to ``Featurizer.log``.
        """
        cateCols = config.encoderColumns

        # 1. Filter method: Pearson correlation.
        Featurizer.log.debug("Featurizer=====Using Pearson Correlation")
        df = pd.get_dummies(df, columns=cateCols, prefix_sep='_')
        # Correlation with the output variable. The original computed this
        # twice with identical inputs; it is computed once here and both log
        # lines are still emitted.
        cor_target = abs(df.corr()[targetfeat])
        # Select highly correlated features.
        relevant_features = cor_target[cor_target > limit]
        Featurizer.log.debug(relevant_features.to_string())
        Featurizer.log.debug("result============")
        Featurizer.log.debug('\n' + relevant_features.to_string())

        # 2. Wrapper method: statsmodels OLS p-values.
        Featurizer.log.debug("Featurizer=====Using statsmodels OLS model")
        # ``axis`` is passed by keyword: the positional form ``drop(label, 1)``
        # is rejected by pandas >= 2.0.
        X = df.drop(targetfeat, axis=1).drop(config.xAxisCol,
                                             axis=1)  # feature matrix
        y = df[targetfeat]  # target variable
        # A constant column of ones is mandatory for sm.OLS.
        X_1 = sm.add_constant(X)
        model = sm.OLS(y, X_1).fit()
        Featurizer.log.debug("result============")
        Featurizer.log.debug(model.pvalues.to_string())

        # A backward-elimination step (repeatedly dropping the feature with
        # the largest p-value above 0.05) existed here only as commented-out
        # code and was never executed.

        # 3. Embedded method: LassoCV.
        reg = LassoCV()
        reg.fit(X, y)
        Featurizer.log.debug("Best alpha using built-in LassoCV: %f" %
                             reg.alpha_)
        Featurizer.log.debug("Best score using built-in LassoCV: %f" %
                             reg.score(X, y))
        coef = pd.Series(reg.coef_, index=X.columns)
        Featurizer.log.debug("Lasso picked " + str(sum(coef != 0)) +
                             " variables and eliminated the other " +
                             str(sum(coef == 0)) + " variables")
        imp_coef = coef.sort_values(ascending=True)
        Featurizer.log.debug('Lasso coef \n' + imp_coef.to_string())
def __init__(self):
    """Create a default MLConfig, attach the logger, and log the class name."""
    self._config = MLConfig()
    self.log = Logger(name='MLFramework')
    concrete_name = self.__class__.__name__
    self.log.debug('ML Base init..%s' % concrete_name)
class MLBase(metaclass=abc.ABCMeta):
    """Abstract pipeline driver: load data, preprocess, train/test, report.

    Subclasses must implement ``dataTransform``, ``getTrainingData`` and
    ``getTestingData``. They may also define an optional
    ``initConfigSetting()`` hook, which ``run`` and ``runPredict`` call first.

    NOTE(review): the source file's indentation was lost; block nesting in
    this class was reconstructed from the statements themselves. Please
    confirm against the original file.
    """

    ver = "MLFramework v0.01"

    def __init__(self):
        self._config = MLConfig()
        self.log = Logger(name='MLFramework')
        self.log.debug('ML Base init..%s' % self.__class__.__name__)

    @property
    def config(self):
        """The configuration object driving the pipeline."""
        return self._config

    @config.setter
    def config(self, value):
        self._config = value

    def getDataFromDB(self):
        """Fetch the input data from the database and cache it as CSV.

        Builds ``select * from <TABLE>`` from ``config.dataSource[0]`` plus an
        optional WHERE clause from its ``CONDITION`` list, runs it through
        ``db_engine.DBEngine`` and writes the result to ``config.datafile``.
        """
        dataSource = self.config.dataSource[0]
        db_name = dataSource['DB']
        # NOTE(review): the SQL text is assembled by string formatting from
        # configuration values; this is only safe while the configuration is
        # trusted.
        query = ['select * from %s ' % dataSource['TABLE']]
        if 'CONDITION' in dataSource:
            conditions = dataSource['CONDITION']
            # NOTE(review): ``> 1`` means a single condition is ignored
            # entirely; this looks unintended (``> 0``?) but is kept as-is
            # because changing it would alter the rows existing configs fetch.
            if len(conditions) > 1:
                for i, cond in enumerate(conditions):
                    query.append('where ' if i == 0 else ' AND ')
                    if cond['operator'] == 'in':
                        # 'a,b' -> ('a','b')
                        query.append(" {} {} ('{}')".format(
                            cond['column'], cond['operator'],
                            "','".join(cond['value'].split(','))))
                    elif cond['value'] == 'null' and cond['operator'] in (
                            '!=', '=='):
                        # null comparisons are emitted without quotes
                        query.append(' {} {} {} '.format(
                            cond['column'], cond['operator'], cond['value']))
                    else:
                        query.append(" {} {} '{}' ".format(
                            cond['column'], cond['operator'], cond['value']))
        conn = db_engine.DBEngine(db_name)
        self.dfInputData = conn.Query(' '.join(query))
        self.dfInputData.to_csv(self.config.datafile, index=False)

    def getMergeDataFile(self):
        """Merge ``config.dataFiles`` and cache the result as CSV."""
        (self.dfInputData, self.strColumnlist, self.numbericColumnlist,
         self.nullColumnlist) = Data.merge(self.config.dataFiles)
        self.dfInputData.to_csv(self.config.datafile, index=False)

    def getInputData(self):
        """Read ``config.datafile`` into ``dfInputData`` and column lists."""
        (self.dfInputData, self.strColumnlist, self.numbericColumnlist,
         self.nullColumnlist) = Data.readData(self.config.datafile)

    def filterData(self):
        """Filter ``dfInputData`` rows by ``config.InputDataCondition``."""
        self.dfInputData = Data.filterDataframe(self.dfInputData,
                                                self.config.InputDataCondition)

    @abc.abstractmethod
    def dataTransform(self):
        """Project-specific transformation of ``dfInputData``."""
        return NotImplemented

    def filterColumns(self):
        """Filter columns of the input data and refresh the column lists.

        When ``config.forceRetrain == True`` the training and testing frames
        are column-filtered as well.
        """
        self.dfInputData = Data.filterColumns(self.dfInputData, self.config)
        (self.dfInputData, self.strColumnlist, self.numbericColumnlist,
         self.nullColumnlist) = Data.analyzeData(self.dfInputData)
        # Explicit comparison kept so only True/1 triggers this branch.
        if self.config.forceRetrain == True:  # noqa: E712
            self.dfTraining = Data.filterColumns(self.dfTraining, self.config)
            self.dfTesting = Data.filterColumns(self.dfTesting, self.config)

    @abc.abstractmethod
    def getTrainingData(self):
        """Return the training dataframe (used when no TrainCondition)."""
        return NotImplemented

    @abc.abstractmethod
    def getTestingData(self):
        """Return the testing dataframe (used when no TestCondition)."""
        return NotImplemented

    def genHTMLReport(self, template='template.html'):
        """Render the HTML report, write it to ./Report and open it.

        Args:
            template: Jinja2 template file name. Any template other than
                ``'template.html'`` additionally receives the per-model raw
                rows from ``self.showRows``.
        """
        pd.set_option("display.precision", 3)
        htmlRender = {}
        # NOTE(review): Styler.render() is deprecated in newer pandas in
        # favour of Styler.to_html(); left unchanged to stay compatible with
        # the pandas version this project targets.
        for i, mClass in enumerate(self.config.runModel):
            htmlRender['fitable{0}'.format(i + 1)] = (
                self.mFeatureImportances[mClass].style.render())
            if mClass != 'LSTMModel':
                htmlRender['sstable{0}'.format(i + 1)] = (
                    ModelAnalysis.sensitivityAnalysis(
                        self.model[mClass], self.mlKind[mClass],
                        self.dfInputData, self.config).style.render())
            if template != 'template.html':
                htmlRender['rawData{0}'.format(i + 1)] = (
                    self.showRows[mClass].style.render())
        htmlRender['accdata'] = (pd.DataFrame(
            self.acc,
            columns=list(self.acc.keys()),
            index=['acc', 'acc by axis', 'total acc']).style.render())
        htmlRender['ploimage'] = '{0}_plot.svg'.format(
            self.config.modelFileKey)
        htmlRender['nowDT'] = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        htmlRender['reportname'] = self.config.reportName

        # Template handling
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=''))
        html = env.get_template(template).render(htmlRender)

        # Write the HTML file and open it in the default browser.
        path = os.path.abspath('./Report/{0}_report.html'.format(
            self.config.modelFileKey))
        url = 'file://' + path
        with open(path, 'w') as f:
            f.write(html)
        webbrowser.open(url)

    def __getDATA(self):
        """Load, filter, transform and split the input data.

        Returns:
            bool: False when the (filtered) input has no rows, in which case
            ``dfTraining``/``dfTesting`` are NOT populated; True otherwise.
            The original returned None in both cases, which let callers
            continue with missing or stale training/testing frames.
        """
        # Read the DB into the local CSV cache.
        print(bcolors.HEADER + "===" + MLBase.ver +
              "===================================" + bcolors.ENDC)
        print(bcolors.WARNING + "===[input]讀取DB==================" +
              bcolors.ENDC)
        self.log.debug("===Data Merge===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'dataSource'):
            self.getDataFromDB()

        # Read and merge data files.
        print(bcolors.HEADER + "===" + MLBase.ver +
              "===================================" + bcolors.ENDC)
        print(bcolors.WARNING + "===[input]資料合併===================" +
              bcolors.ENDC)
        self.log.debug("===Data Merge===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'dataFiles'):
            self.getMergeDataFile()

        # Read the data CSV (from self.config.datafile).
        print(bcolors.WARNING + "===[input]讀取資料===================" +
              bcolors.ENDC)
        self.log.debug("===Fetch Data===================%s" %
                       self.__class__.__name__)
        self.getInputData()

        print(bcolors.WARNING + "===[input]資料過濾===================" +
              bcolors.ENDC)
        self.log.debug("===Filter Input Data===================%s" %
                       self.__class__.__name__)
        if hasattr(self.config, 'InputDataCondition'):
            self.filterData()
        if self.dfInputData.shape[0] == 0:
            print(bcolors.WARNING + "資料筆數 : 0" + bcolors.ENDC)
            self.log.debug("資料筆數 : 0 Rows {}, Columns {}".format(
                self.dfInputData.shape[0], self.dfInputData.shape[1]))
            return False

        print(bcolors.WARNING + "===[input]資料轉換===================" +
              bcolors.ENDC)
        self.log.debug("===Data Transform===================%s" %
                       self.__class__.__name__)
        self.dataTransform()

        # Training set & testing set.
        print(bcolors.WARNING + "===[input]篩選訓練集 & 測試集================" +
              bcolors.ENDC)
        if hasattr(self.config, 'TrainCondition') and hasattr(
                self.config, 'TestCondition'):
            self.dfTraining = Data.filterDataframe(self.dfInputData,
                                                   self.config.TrainCondition)
            self.dfTesting = Data.filterDataframe(self.dfInputData,
                                                  self.config.TestCondition)
        else:
            self.dfTraining = self.getTrainingData()
            self.dfTesting = self.getTestingData()

        print(bcolors.WARNING + "===[input]過濾資料===================" +
              bcolors.ENDC)
        self.log.debug("===Data Filter===================%s" %
                       self.__class__.__name__)
        self.filterColumns()
        return True

    def EDAAnalysis(self):
        """Load the data and write the single-dataset EDA report."""
        self.__getDATA()
        EDA.analysis(self.dfInputData, self.config.targetCol, self.config)

    def EDACompare(self):
        """Load the data and write the train-vs-test EDA comparison report."""
        if not self.__getDATA():
            # No rows: training/testing frames were never built.
            return
        EDA.compare(self.dfTraining, self.dfTesting, self.config.targetCol,
                    self.config)

    def FeatureSelect(self):
        """Load the data and log the feature-selection diagnostics."""
        self.__getDATA()
        Featurizer.Select(self.dfInputData, self.config.targetCol, self.config)

    def checkDFSetHasData(self):
        """Check that the input, training and testing sets all have rows.

        Prints the shape of each set. Callers stop running the models when
        this returns False.

        Returns:
            bool: False if any of the three frames has zero rows.
        """
        print(bcolors.WARNING + "資料筆數 : ({},{})".format(
            self.dfInputData.shape[0], self.dfInputData.shape[1]) +
              bcolors.ENDC)
        print(bcolors.WARNING + "Training Set 資料筆數 : ({},{})".format(
            self.dfTraining.shape[0], self.dfTraining.shape[1]) +
              bcolors.ENDC)
        print(bcolors.WARNING + "Testing Set 資料筆數 : ({},{})".format(
            self.dfTesting.shape[0], self.dfTesting.shape[1]) + bcolors.ENDC)
        if self.dfInputData.shape[0] == 0:
            self.log.debug("Input Set 資料筆數 : 0 ")
            return False
        if self.dfTraining.shape[0] == 0:
            self.log.debug("Training Set 資料筆數 : 0 ")
            return False
        if self.dfTesting.shape[0] == 0:
            self.log.debug("Testing Set 資料筆數 : 0 ")
            return False
        return True

    def run(self):
        """Full pipeline: load data, preprocess, train, test, build report.

        Returns early (None) when there is no input data or when any of the
        input/training/testing sets is empty.
        """
        init_hook = getattr(self, 'initConfigSetting', None)
        if callable(init_hook):
            init_hook()

        if not self.__getDATA():
            # Empty input: dfTraining/dfTesting do not exist, nothing to run.
            return
        print(bcolors.WARNING + "===GET DATA END==================" +
              bcolors.ENDC)
        print(bcolors.WARNING + "dfTraining describe----------------" +
              bcolors.ENDC)
        self.log.debug('\n' + self.dfTraining.describe().to_string())
        print(bcolors.WARNING + "dfTesting describe----------------" +
              bcolors.ENDC)
        self.log.debug('\n' + self.dfTesting.describe().to_string())

        # --- Data preprocessing: fill missing values ---
        print(bcolors.WARNING + "===填補遺漏值==================" + bcolors.ENDC)
        self.log.debug("===填補遺漏值==================%s" %
                       self.__class__.__name__)
        self.dfTraining = Data.fillnull(self.dfTraining, self.nullColumnlist,
                                        self.config.fillNaType.value)
        self.dfTesting = Data.fillnull(self.dfTesting, self.nullColumnlist,
                                       self.config.fillNaType.value)
        self.dfOriTesting = self.dfTesting.copy(deep=False)
        self.dfTraining.to_csv("./Report/" + self.config.modelFileKey +
                               '_Training.csv')
        if not self.checkDFSetHasData():
            return

        # --- Feature scaling ---
        print(bcolors.WARNING + "===特徵縮放===================" + bcolors.ENDC)
        self.log.debug("===特徵縮放===================%s" %
                       self.__class__.__name__)
        # Default to MINMAX when the config names no scaler at all.
        if not (hasattr(self.config, 'scalerKind')
                or hasattr(self.config, 'muiltiScalerKind')):
            self.config.scalerKind = scalerKind.MINMAX
        if hasattr(self.config, 'scalerKind'):
            self.dfTraining = Data.scalerData(self.dfTraining,
                                              self.config.scalerKind.value,
                                              self.numbericColumnlist,
                                              self.config,
                                              isTrain=True)
            self.dfTesting = Data.scalerData(self.dfTesting,
                                             self.config.scalerKind.value,
                                             self.numbericColumnlist,
                                             self.config,
                                             isTrain=False)
        elif hasattr(self.config, 'muiltiScalerKind'):
            self.dfTraining = Data.multiScalerData(self.dfTraining,
                                                   self.numbericColumnlist,
                                                   self.config,
                                                   isTrain=True)
            self.dfTesting = Data.multiScalerData(self.dfTesting,
                                                  self.numbericColumnlist,
                                                  self.config,
                                                  isTrain=False)

        # --- Feature transformation ---
        print(bcolors.WARNING + "===特徵轉換===================" + bcolors.ENDC)
        self.log.debug("===特徵轉換===================%s" %
                       self.__class__.__name__)
        # Original comment: "exclude target_cols xAxisCol".
        self.dfTraining_eh = Data.featureTransform(self.dfTraining,
                                                   self.config, True)
        self.dfTraining_eh.to_csv("./log/" + self.config.modelFileKey +
                                  '_Training_eh.csv')
        self.dfTesting_eh = Data.featureTransform(self.dfTesting, self.config,
                                                  False)
        self.dfTesting_eh.to_csv("./log/" + self.config.modelFileKey +
                                 '_Testing_eh.csv')
        self.dfInputData = self.dfTraining_eh

        print(bcolors.WARNING + "===Ready for Training===================" +
              bcolors.ENDC)
        self.log.debug("===Ready for Training===================%s" %
                       self.__class__.__name__)
        self.X = np.asarray(self.dfTraining_eh)
        self.y = np.asarray(self.dfTraining[self.config.targetCol])

        print(bcolors.WARNING + "===Ready for Testing===================" +
              bcolors.ENDC)
        self.log.debug("===Ready for Testing===================%s" %
                       self.__class__.__name__)
        self.XTest = np.asarray(self.dfTesting_eh)

        # --- Model training ---
        print(bcolors.OKBLUE + "===訓練模型====================" + bcolors.ENDC)
        self.log.debug("===Model Training===================%s" %
                       self.__class__.__name__)
        self.config._featureList = list(self.dfTraining_eh.columns)
        print(bcolors.WARNING + "_featureList : " +
              ''.join(self.config._featureList) + bcolors.ENDC)
        self.log.debug("_featureList : {} \n".format(' , '.join(
            self.config._featureList)))
        self.model = {}
        self.mlKind = {}
        self.mFeatureImportances = {}
        self.acc = {}
        for mClass in self.config.runModel:
            # Each model class lives in a module of the same name that must
            # be present in this module's globals.
            mObj = getattr(globals()[mClass], mClass)()
            if mClass == 'LSTMModel':
                # Add a trailing axis of size 1 for the LSTM input.
                self.X = np.reshape(self.X,
                                    (self.X.shape[0], self.X.shape[1], 1))
            # Up to four attempts; stop at the first one whose accuracy on
            # the training set exceeds 40. (The original reused ``i`` here,
            # shadowing the outer loop index.)
            for attempt in range(1, 5):
                (self.model[mClass], self.mlKind[mClass],
                 self.mFeatureImportances[mClass]) = mObj.doTraining(
                     self.X, self.y, self.config)
                _validation_config = copy.deepcopy(self.config)
                _validation_config.modelFileKey = (
                    _validation_config.modelFileKey + "_Val")
                _acc, _, _, _ = Data.testModel(self.X, self.model[mClass],
                                               self.mlKind[mClass],
                                               self.dfTraining,
                                               _validation_config)
                if _acc > 40:
                    self.log.debug(
                        "doTraining>0.6 ---- No.{} => {} PASS.".format(
                            attempt, _acc))
                    break

        # --- Model testing ---
        print(bcolors.OKBLUE + "===測試模型====================" + bcolors.ENDC)
        self.log.debug("===Model Testing===================%s" %
                       self.__class__.__name__)
        plt.style.use('ggplot')
        plt.figure(figsize=(20, 6 * len(self.config.runModel)), dpi=60)
        for i, mClass in enumerate(self.config.runModel):
            # One subplot row per model.
            plt.subplot(len(self.config.runModel) * 100 + 10 + 1 + i)
            if mClass == 'LSTMModel':
                self.XTest = np.reshape(
                    self.XTest,
                    (self.XTest.shape[0], self.XTest.shape[1], 1))
            _acc, _accsum, _totol_acc, _ = Data.testModel(
                self.XTest, self.model[mClass], self.mlKind[mClass],
                self.dfOriTesting, self.config)
            self.acc[mClass] = [_acc, _accsum, _totol_acc]
        plt.tight_layout()
        plt.savefig('./Report/{0}_plot.svg'.format(self.config.modelFileKey))

        # --- Report generation ---
        print(bcolors.OKBLUE + "===產生報表====================" + bcolors.ENDC)
        self.log.debug("===Create Report===================%s" %
                       self.__class__.__name__)
        self.genHTMLReport()

    def runPredict(self):
        """Prediction-only pipeline using previously saved models.

        Loads and preprocesses the testing data, loads each configured model
        through ``loadTestingModel``, evaluates it and renders
        ``template2.html``. Returns early (None) when there is no input data.
        """
        init_hook = getattr(self, 'initConfigSetting', None)
        if callable(init_hook):
            init_hook()

        if not self.__getDATA():
            # Empty input: dfTesting does not exist, nothing to predict on.
            return

        # --- Data preprocessing: fill missing values ---
        print(bcolors.WARNING + "===填補遺漏值==================" + bcolors.ENDC)
        self.log.debug("===填補遺漏值==================%s" %
                       self.__class__.__name__)
        self.dfTesting = Data.fillnull(self.dfTesting, self.nullColumnlist,
                                       self.config.fillNaType.value)
        self.dfOriTesting = self.dfTesting.copy(deep=True)

        # --- Feature scaling ---
        print(bcolors.WARNING + "===特徵縮放===================" + bcolors.ENDC)
        self.log.debug("===特徵縮放===================%s" %
                       self.__class__.__name__)
        if not hasattr(self.config, 'scalerKind'):
            self.config.scalerKind = scalerKind.MINMAX
        self.dfTesting = Data.scalerData(self.dfTesting,
                                         self.config.scalerKind.value,
                                         self.numbericColumnlist,
                                         self.config,
                                         isTrain=False)

        # --- Feature transformation ---
        print(bcolors.WARNING + "===特徵轉換===================" + bcolors.ENDC)
        self.log.debug("===特徵轉換===================%s" %
                       self.__class__.__name__)
        # Original comment: "exclude target_cols xAxisCol".
        self.dfTesting_eh = Data.featureTransform(self.dfTesting, self.config,
                                                  False)
        self.dfTesting_eh.to_csv("./report/" + self.config.modelFileKey +
                                 '_Testing.csv')

        print(bcolors.WARNING + "===Ready for Testing===================" +
              bcolors.ENDC)
        self.log.debug("===Ready for Testing===================%s" %
                       self.__class__.__name__)
        self.XTest = np.asarray(self.dfTesting_eh)
        self.dfInputData = self.dfTesting_eh

        # --- Model "training" banner (models are loaded, not trained) ---
        print(bcolors.OKBLUE + "===訓練模型====================" + bcolors.ENDC)
        self.log.debug("===Model Training===================%s" %
                       self.__class__.__name__)
        self.config._featureList = list(self.dfTesting_eh.columns)
        print(bcolors.WARNING + "_featureList : " +
              ''.join(self.config._featureList) + bcolors.ENDC)
        self.log.debug("_featureList : {} \n".format(' , '.join(
            self.config._featureList)))
        self.model = {}
        self.mlKind = {}
        self.mFeatureImportances = {}
        self.acc = {}
        self.showRows = {}

        # --- Model testing ---
        print(bcolors.OKBLUE + "===測試模型====================" + bcolors.ENDC)
        self.log.debug("===Model Testing===================%s" %
                       self.__class__.__name__)
        plt.style.use('ggplot')
        plt.figure(figsize=(20, 6 * len(self.config.runModel)), dpi=60)
        for i, mClass in enumerate(self.config.runModel):
            plt.subplot(len(self.config.runModel) * 100 + 10 + 1 + i)
            mObj = getattr(globals()[mClass], mClass)()
            if mClass == 'LSTMModel':
                self.XTest = np.reshape(
                    self.XTest,
                    (self.XTest.shape[0], self.XTest.shape[1], 1))
            (self.model[mClass], self.mlKind[mClass],
             self.mFeatureImportances[mClass]) = mObj.loadTestingModel(
                 self.config)
            _acc, _accsum, _totol_acc, _showRow = Data.testModel(
                self.XTest, self.model[mClass], self.mlKind[mClass],
                self.dfOriTesting, self.config)
            self.acc[mClass] = [_acc, _accsum, _totol_acc]
            _showRow.insert(loc=0, column='model', value=mClass)
            self.showRows[mClass] = _showRow
        plt.tight_layout()
        plt.savefig('./Report/{0}_plot.svg'.format(self.config.modelFileKey))

        # --- Report generation ---
        print(bcolors.OKBLUE + "===產生報表====================" + bcolors.ENDC)
        self.log.debug("===Create Report===================%s" %
                       self.__class__.__name__)
        self.genHTMLReport('template2.html')
class MLBase(metaclass=abc.ABCMeta):
    """Abstract base class that drives one complete pipeline run.

    ``run`` executes the fixed sequence of steps (merge, read, filter,
    transform, fill missing values, scale, feature transform, train, test,
    report).  Subclasses supply the project-specific steps by implementing
    ``dataTransform``, ``featureTransform``, ``getTrainingData`` and
    ``getTestingData``.
    """

    ver = "MLFramework v0.01"

    def __init__(self):
        self._config = MLConfig()
        self.log = Logger(name='MLFramework')
        self.log.debug('ML Base init..%s' % self.__class__.__name__)

    @property
    def config(self):
        """The configuration object used by every pipeline step."""
        return self._config

    @config.setter
    def config(self, value):
        self._config = value

    # ------------------------------------------------------------------
    # Data loading / filtering
    # ------------------------------------------------------------------
    def _storeAnalysis(self, analysis):
        """Store the 4-tuple (frame, string/numeric/null column lists)."""
        (self.dfInputData, self.strColumnlist, self.numbericColumnlist,
         self.nullColumnlist) = analysis

    def getMergeDataFile(self):
        """Merge ``config.dataFiles`` and write the result to ``config.datafile``."""
        self._storeAnalysis(Data.merge(self.config.dataFiles))
        self.dfInputData.to_csv(self.config.datafile, index=False)

    def getInputData(self):
        """Load ``config.datafile`` into ``self.dfInputData``."""
        self._storeAnalysis(Data.readData(self.config.datafile))

    def filterData(self):
        """Apply ``config.InputDataCondition`` and re-analyze the columns."""
        self.dfInputData = Data.filterDataframe(self.dfInputData,
                                                self.config.InputDataCondition)
        self._storeAnalysis(Data.readDataFrame(self.dfInputData))

    @abc.abstractmethod
    def dataTransform(self):
        return NotImplemented

    def filterColumns(self):
        """Keep/drop columns as configured and re-analyze the columns."""
        self.dfInputData = Data.filterColumns(self.dfInputData, self.config)
        self._storeAnalysis(Data.analyzeData(self.dfInputData))

    # ------------------------------------------------------------------
    # Missing values / scaling
    # ------------------------------------------------------------------
    def _fillNullColumnsWithMedian(self):
        """Fill ``nullColumnlist`` with column medians, then 0 for the rest."""
        nullCols = list(self.nullColumnlist)
        if not nullCols:
            return
        # numeric_only=True: recent pandas raises on non-numeric columns
        # instead of silently skipping them.
        medians = self.dfInputData.median(numeric_only=True)
        self.dfInputData[nullCols] = (
            self.dfInputData[nullCols].fillna(medians).fillna(value=0))

    def fillnull(self):
        """Fill missing values according to ``config.fillNaType``.

        Supported values of ``config.fillNaType.value``: ``'mean'``,
        ``'mode'``, ``'bfill'``, ``'ffill'``, ``'dropna'`` and ``'zero'``.
        When the config has no ``fillNaType`` attribute the null columns are
        filled with the column median (then 0).  An unrecognised value
        leaves the data untouched.
        """
        if not hasattr(self.config, 'fillNaType'):
            self._fillNullColumnsWithMedian()
            return

        # Every branch must read the strategy from the config; the object
        # itself has no ``fillNaType`` attribute.
        fillType = self.config.fillNaType.value
        if fillType == 'mean':
            # NOTE(review): the 'mean' strategy has always filled with the
            # median; kept as is -- confirm whether that is intended.
            self._fillNullColumnsWithMedian()
        elif fillType == 'mode':
            # ``mode()`` returns a frame; passing it straight to ``fillna``
            # aligns on the row index and only touches the first rows.  Use
            # its first row as the per-column fill value instead.
            modes = self.dfInputData.mode()
            if not modes.empty:
                self.dfInputData = self.dfInputData.fillna(
                    modes.iloc[0].dropna())
        elif fillType in ('bfill', 'ffill'):
            medians = self.dfInputData.median(numeric_only=True)
            # ``fillna(method=...)`` is deprecated; call bfill()/ffill().
            propagated = (self.dfInputData.bfill()
                          if fillType == 'bfill' else self.dfInputData.ffill())
            # Cells with no neighbour to copy from fall back to the median.
            self.dfInputData = propagated.fillna(medians)
        elif fillType == 'dropna':
            self.dfInputData = self.dfInputData.dropna()
        elif fillType == 'zero':
            nullCols = list(self.nullColumnlist)
            if nullCols:
                self.dfInputData[nullCols] = (
                    self.dfInputData[nullCols].fillna(0))

    def scalerData(self):
        """Scale the numeric columns with the 'MinMaxScaler' option."""
        self.dfInputData = Data.scalerData(self.dfInputData, 'MinMaxScaler',
                                           self.numbericColumnlist,
                                           self.config)
        print(self.dfInputData)

    @abc.abstractmethod
    def featureTransform(self):
        return NotImplemented

    @abc.abstractmethod
    def getTrainingData(self):
        return NotImplemented

    @abc.abstractmethod
    def getTestingData(self):
        return NotImplemented

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    @staticmethod
    def _stylerToHtml(styler):
        """Render a pandas Styler to HTML on both old and new pandas."""
        # ``Styler.render`` was removed in pandas 2.0; ``to_html`` replaces it.
        render = getattr(styler, 'to_html', None) or styler.render
        return render()

    def genHTMLReport(self, templateName='template.html'):
        """Render the HTML report and open it in the default browser.

        Args:
            templateName: Jinja2 template file, looked up relative to the
                current working directory.  Defaults to ``'template.html'``.
        """
        pd.set_option("display.precision", 3)
        htmlRender = {}
        for position, mClass in enumerate(self.config.runModel, start=1):
            htmlRender['fitable{0}'.format(position)] = self._stylerToHtml(
                self.mFeatureImportances[mClass].style)
            if mClass != 'LSTMModel':
                sensitivity = ModelAnalysis.sensitivityAnalysis(
                    self.model[mClass], self.mlKind[mClass],
                    self.dfInputData, self.config)
                htmlRender['sstable{0}'.format(position)] = (
                    self._stylerToHtml(sensitivity.style))
        htmlRender['ploimage'] = '{0}_plot.svg'.format(
            self.config.modelFileKey)

        # Template handling
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=''))
        html = env.get_template(templateName).render(htmlRender)

        # Write the HTML file
        path = os.path.abspath('./Report/{0}_report.html'.format(
            self.config.modelFileKey))
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The loader decodes templates as UTF-8 by default, so write the
        # rendered page back in the same encoding instead of the locale's.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(html)
        webbrowser.open('file://' + path)

    # ------------------------------------------------------------------
    # Pipeline driver
    # ------------------------------------------------------------------
    def _announce(self, color, banner, logTemplate):
        """Print a coloured stage banner and log the stage with the class name."""
        print(color + banner + bcolors.ENDC)
        self.log.debug(logTemplate % self.__class__.__name__)

    def _dropXAxisColumn(self, df):
        """Return ``df`` without ``config.xAxisCol`` (no-op when absent)."""
        present = [c for c in [self.config.xAxisCol] if c in df.columns]
        return df.drop(present, axis=1)

    @staticmethod
    def _modelInput(mClass, data):
        """Return ``data`` in the layout the model named ``mClass`` is given.

        'LSTMModel' receives a trailing axis of length 1.  The reshape is
        returned as a new array so that the stored 2-D matrix is not
        replaced and the other models in the same run still get 2-D input.
        """
        if mClass == 'LSTMModel':
            return np.reshape(data, (data.shape[0], data.shape[1], 1))
        return data

    def run(self):
        """Execute the whole pipeline, from data merge to the HTML report."""
        print(bcolors.HEADER + "===" + MLBase.ver +
              "===================================" + bcolors.ENDC)

        self._announce(bcolors.WARNING, "===資料合併===================",
                       "===Data Merge===================%s")
        if hasattr(self.config, 'dataFiles'):
            self.getMergeDataFile()

        self._announce(bcolors.WARNING, "===讀取資料===================",
                       "===Fetch Data===================%s")
        self.getInputData()

        self._announce(bcolors.WARNING, "===資料過濾===================",
                       "===Filter Input Data===================%s")
        if hasattr(self.config, 'InputDataCondition'):
            self.filterData()

        self._announce(bcolors.WARNING, "===資料轉換===================",
                       "===Data Transform===================%s")
        self.dataTransform()

        self._announce(bcolors.WARNING, "===過濾資料===================",
                       "===Data Filter===================%s")
        self.filterColumns()

        self._announce(bcolors.WARNING, "===填補遺漏值==================",
                       "===fill None==================%s")
        self.fillnull()
        self.dfInputData.info()

        self._announce(bcolors.WARNING, "===特徵縮放===================",
                       "===scale===================%s")
        self.scalerData()

        self._announce(bcolors.WARNING, "===特徵轉換===================",
                       "===feature Transform===================%s")
        self.featureTransform()
        self.dfInputData.info()

        self._announce(bcolors.WARNING, "===準備訓練資料================",
                       "===Ready for Training===================%s")
        self.dfTraining = self._dropXAxisColumn(self.getTrainingData())
        trainingFeatures = self.dfTraining.drop(self.config.targetCol, axis=1)
        self.X = np.asarray(trainingFeatures)
        self.y = np.asarray(self.dfTraining[self.config.targetCol])

        self._announce(bcolors.WARNING, "===準備測試資料================",
                       "===Ready for Testing===================%s")
        self.dfOriTesting = self.getTestingData()
        self.dfTesting = self._dropXAxisColumn(
            self.dfOriTesting.copy(deep=False))
        self.XTest = np.asarray(
            self.dfTesting.drop(self.config.targetCol, axis=1))

        self._announce(bcolors.OKBLUE, "===訓練模型====================",
                       "===Model Training===================%s")
        self.config._featureList = list(trainingFeatures.columns)
        self.model = {}
        self.mlKind = {}
        self.mFeatureImportances = {}
        for mClass in self.config.runModel:
            # Each model class lives in a module of the same name.
            mObj = getattr(globals()[mClass], mClass)()
            (self.model[mClass], self.mlKind[mClass],
             self.mFeatureImportances[mClass]) = mObj.doTraining(
                 self._modelInput(mClass, self.X), self.y, self.config)

        self._announce(bcolors.OKBLUE, "===測試模型====================",
                       "===Model Testing===================%s")
        modelCount = len(self.config.runModel)
        plt.style.use('ggplot')
        plt.figure(figsize=(20, 6 * modelCount), dpi=60)
        for position, mClass in enumerate(self.config.runModel, start=1):
            # Three-argument form: the packed 3-digit form only works for
            # up to nine rows.
            plt.subplot(modelCount, 1, position)
            Data.testModel(self._modelInput(mClass, self.XTest),
                           self.model[mClass], self.mlKind[mClass],
                           self.dfOriTesting, self.config)
        plt.tight_layout()
        os.makedirs('./Report', exist_ok=True)
        plt.savefig('./Report/{0}_plot.svg'.format(self.config.modelFileKey))

        self._announce(bcolors.OKBLUE, "===產生報表====================",
                       "===Create Report===================%s")
        self.genHTMLReport()
class Data: log = Logger(name='MLFramework') @staticmethod def readData(inputfile): Data.log.debug('readData ==> %s' % inputfile) df = pd.read_csv(inputfile) df = df.dropna(axis=1, how='all') df.info() return Data.analyzeData(df) @staticmethod def readDataFrame(df): df = df.dropna(axis=1, how='all') return Data.analyzeData(df) @staticmethod def merge(dataFiles): index = 0 for dfFile in dataFiles['files']: print(dfFile) if index == 0: _dfInputData1, _strColumnlist1, _numbericColumnlist1, _nullColumnlist1 = Data.readData( dfFile) # _df_result. else: datasetRels = dataFiles['relations'][index - 1] _dfInputData2, _strColumnlist2, _numbericColumnlist2, _nullColumnlist2 = Data.readData( dfFile) _dfInputData1.set_index(datasetRels[0]) _dfInputData2.set_index(datasetRels[1]) df_merge = Data.mergeDataFrame(_dfInputData1, _dfInputData2, datasetRels[0], datasetRels[1]) _dfInputData1 = df_merge.copy(deep=False) index += 1 return Data.analyzeData(df_merge) @staticmethod def mergeDataFrame(dfleft, dfright, LeftKeys, RightKeys): # dfright.columns = [str(col) + '_'+joinTableName for col in df.columns] df_merge = pd.merge(dfleft, dfright, left_on=LeftKeys, right_on=RightKeys, how="inner") return df_merge @staticmethod def analyzeData(df): print('非數值欄位:') strColumnlist = df.select_dtypes( exclude=['int64', 'float64']).columns.tolist() print(strColumnlist) print('數值欄位:') numbericColumnlist = df.select_dtypes( include=['int64', 'float64']).columns.tolist() print(numbericColumnlist) print('包含NULL的欄位:') nullColumnlist = df.columns[df.isna().any()].tolist() print(nullColumnlist) print('計算NULL筆:') print( pd.DataFrame({ 'COUNT': df.isnull().sum(), 'Missing Ratio': (df.isnull().sum() * 100 / df.shape[0]) })) Data.log.debug('\n' + pd.DataFrame({ 'COUNT': df.isnull().sum(), 'Missing Ratio': (df.isnull().sum() * 100 / df.shape[0]) }).to_string()) Data.log.debug('\n' + df.describe().to_string()) print('===================================================') return df, strColumnlist, 
numbericColumnlist, nullColumnlist @staticmethod def filterDataframe(df, condition): for c in condition: if c['operator'] == "=": df = df[df[c['column']] == c['value']] elif c['operator'] == "=!" or c['operator'] == "!=": df = df[df[c['column']] != c['value']] elif c['operator'] == "<=": df = df[df[c['column']] <= c['value']] elif c['operator'] == "<": df = df[df[c['column']] < c['value']] elif c['operator'] == ">=": df = df[df[c['column']] >= c['value']] elif c['operator'] == ">": df = df[df[c['column']] > c['value']] return df @staticmethod def fillnull(df, nullColumnlist, fillType): if (fillType == 'mean'): # df[nullColumnlist] = df[nullColumnlist].fillna(df.median()).fillna(value=0) df = df.fillna(df.median()).fillna(value=0) elif (fillType == 'mode'): df = df.fillna(df.mode()) elif (fillType == 'bfill'): df = df.fillna(method='bfill').fillna(df.median()) elif (fillType == 'ffill'): df = df.fillna(method='ffill').fillna(df.median()) elif (fillType == 'dropna'): df = df.dropna() elif (fillType == 'zero'): df = df.fillna(0) return df @staticmethod def filterColumns(df, config): try: includeColumns = config.includeColumns excludeColumns = config.excludeColumns if (len(includeColumns) > 0): df = df[includeColumns] if (len(excludeColumns) > 0): df = df.drop(columns=excludeColumns) return df except: Data.log.error("filterColumns error:", sys.exc_info()[0]) #Data.log.error(error) raise # just this! # raise AppError # Don't do this, you'll lose the stack trace! 
@staticmethod def scalerData(df, scalerKind, numbericColumnlist, config, isTrain=True): if len(numbericColumnlist) > 0: target_cols = config.targetCol scalerColumnlist = [ ele for ele in numbericColumnlist if ele not in target_cols ] scalerColumnlist = list( set(scalerColumnlist).intersection(df.columns)) if isTrain: scaler = None if (scalerKind == 'standard'): scaler = StandardScaler() elif (scalerKind == 'minmax'): scaler = MinMaxScaler() elif (scalerKind == 'MinMaxScaler'): scaler = MinMaxScaler() elif (scalerKind == 'robust'): scaler = RobustScaler() elif (scalerKind == 'maxabs'): scaler = MaxAbsScaler() elif (scalerKind == 'normal'): scaler = Normalizer() else: scaler = MinMaxScaler() scaler.fit(df[scalerColumnlist]) df[scalerColumnlist] = scaler.transform(df[scalerColumnlist]) dump( scaler, open('model/scaler_{}.pkl'.format(config.modelFileKey), 'wb')) else: scaler = load( open('model/scaler_{}.pkl'.format(config.modelFileKey), 'rb')) df[scalerColumnlist] = scaler.transform(df[scalerColumnlist]) return df @staticmethod def featureTransform(df, config, isTrain=True): if len(config.encoderColumns) > 0: target_cols = config.targetCol df = pd.get_dummies(df.drop([target_cols, config.xAxisCol], axis=1), columns=config.encoderColumns, prefix_sep='_') if isTrain: df.head(0).to_csv('model/eh_{}.csv'.format( config.modelFileKey), index=0) #不保存行索引 else: df_eh = pd.read_csv('model/eh_{}.csv'.format( config.modelFileKey)) #不保存行索引 df = df.reindex(columns=df_eh.columns, fill_value=0) # Ensure the order of column in the test set is in the same order than in train set df = df[df_eh.columns] return df @staticmethod def accsum(def_result, target_cols): _accsum = 0 for index, row in def_result.iterrows(): #避免當分母為0 會無法計算 if row[target_cols] == 0 and row['Predict'] == 0: row[target_cols] = 1 row['Predict'] = 1 elif row[target_cols] == 0 and row['Predict'] != 0: row[target_cols] = 0.00001 if row[target_cols] < 0: row[target_cols] = 0.00001 if row['Predict'] < 0: row['Predict'] = 0 
if 1 - abs( (row['Predict'] - row[target_cols]) / row[target_cols]) > 0: _accsum += (1 - abs( (row['Predict'] - row[target_cols]) / row[target_cols])) return round(_accsum * 100 / def_result.shape[0], 2) @staticmethod def testModel(XTest, model, mlKind, dfOri, config): yTest = model.predict(XTest) df2 = dfOri.copy(deep=False) df2.insert(len(df2.columns), 'Predict', yTest) plt.title((mlKind + ":{0}%").format(Data.accsum(df2, config.targetCol))) plt.xlabel(config.xAxisCol) plt.xticks(rotation=90) plt.ylabel(config.targetCol) df2 = df2.sort_values(config.xAxisCol, ascending=True) t = df2[config.xAxisCol].to_numpy() + '_' + np.arange( len(XTest)).astype(str) # 创建t变量 plt.plot(t, df2['Predict'], label=mlKind, color='red', marker='.', linewidth='0.5') plt.plot(t, df2[config.targetCol], label="ACT", color='blue', marker='x', linewidth='0') plt.legend() plt.ylim(bottom=0) df2.to_csv('./Report/' + config.modelFileKey + '_' + mlKind + '.csv', index=False) _acc = mlKind, Data.accsum(df2, config.targetCol) print(mlKind + ' ' + config.modelFileKey + " Test acc : %.2f" % _acc[1]) Data.log.debug(mlKind + ' ' + config.modelFileKey + " Test acc: %.2f" % _acc[1]) _accsum = 0 def_result_summary = df2.groupby(config.xAxisCol, as_index=False).sum().reset_index()[[ config.xAxisCol, config.targetCol, 'Predict' ]] if (def_result_summary.shape[0] > 1): _acc = mlKind, Data.accsum(def_result_summary, config.targetCol) Data.log.debug(mlKind + ' ' + config.modelFileKey + " Test group by x-axis acc: %.2f" % _acc[1]) print(mlKind + " Test group by x-axis acc: %.2f" % _acc[1]) def_result_summary = df2[[config.targetCol, 'Predict']].sum() if (def_result_summary[config.targetCol] != 0): totol_acc = (1 - abs(def_result_summary['Predict'] - def_result_summary[config.targetCol]) / def_result_summary[config.targetCol]) * 100 print(mlKind + " Test Aggreation acc : %.2f " % totol_acc) Data.log.debug(mlKind + ' ' + config.modelFileKey + " Test Aggreation acc : %.2f " % totol_acc) dfraw = 
pd.read_csv(config.datafile)