def solution_saving(df, sel_model, client_lvl_cols_in, client_lvl_sels):
    truncate_query_part_2 = ' '.join(
        ['and {} = \'{}\''.format(x, y)
         for x, y in zip(client_lvl_cols_in, client_lvl_sels) if y != '-'])

    # Replaces the values of the Client's Levels with the actual values selected for this solution
    df = client_replacement(df, client_lvl_cols_in, client_lvl_sels)

    level_1_e_deployment.sql_truncate(
        options_file.DSN_SRV3_PRD,
        options_file,
        options_file.sql_info['database_source'],
        options_file.sql_info['optimization_solution_table'],
        query=truncate_query.format(sel_model) + truncate_query_part_2)

    level_1_e_deployment.sql_inject(
        df,
        options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['optimization_solution_table'],
        options_file,
        configuration_parameters + client_lvl_cols_in + ['Quantity', 'Average_Score_Euros', 'ML_VehicleData_Code'],
        check_date=1)

    st.write('Sugestão gravada com sucesso.')  # "Suggestion saved successfully."
    return
def deployment(df, db, view):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', project_id)

    if df is not None:
        df['NLR_Code'] = level_2_optionals_cdsu_options.nlr_code
        # df = column_rename(df, list(level_2_optionals_cdsu_options.column_sql_renaming.keys()), list(level_2_optionals_cdsu_options.column_sql_renaming.values()))
        df = df.rename(columns=level_2_optionals_cdsu_options.column_sql_renaming)
        control_prints(df, 'before deployment, after renaming', head=1)

        sql_delete(
            level_2_optionals_cdsu_options.DSN_MLG_PRD, db, view,
            level_2_optionals_cdsu_options,
            {'NLR_Code': '{}'.format(level_2_optionals_cdsu_options.nlr_code)})
        sql_inject(
            df, level_2_optionals_cdsu_options.DSN_MLG_PRD, db, view,
            level_2_optionals_cdsu_options,
            list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()),
            check_date=1)

    log_record('Fim Secção E.', project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
def sql_upload(df, db, view):
    df['Totals'] = df.sum(axis=1)
    df.index.rename('Actual', inplace=True)
    df.reset_index(inplace=True)

    sql_inject(df, options_file.DSN_MLG_PRD, db, view, options_file, list(df), truncate=1, check_date=1)
    return
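# A minimal, self-contained sketch of the reshaping sql_upload applies to a confusion matrix
# before injection (the class labels and counts below are made up for illustration, and the
# sql_inject call itself is omitted).
def _example_confusion_matrix_reshape():
    import pandas as pd

    # Hypothetical confusion matrix: rows = actual class, columns = predicted class.
    cm = pd.DataFrame([[50, 3], [7, 40]], index=['class_0', 'class_1'], columns=['class_0', 'class_1'])

    cm['Totals'] = cm.sum(axis=1)            # row totals, i.e. support per actual class
    cm.index.rename('Actual', inplace=True)
    cm.reset_index(inplace=True)             # 'Actual' becomes a regular column for the SQL upload

    return cm
    #     Actual  class_0  class_1  Totals
    # 0  class_0       50        3      53
    # 1  class_1        7       40      47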
def update_family(df, new_family_classification, df_product_group):
    new_family_classification_code = family_code_convertion(new_family_classification, df_product_group)

    df['New_Product_Group_DW'] = new_family_classification_code
    df.rename(columns={'Product_Group_DW': 'Old_Product_Group_DW'}, inplace=True)

    level_1_e_deployment.sql_inject(
        df, options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['parts_classification_refs'],
        options_file,
        ['Part_Ref', 'Part_Description', 'Part_Cost', 'Part_PVP', 'Client_ID', 'Old_Product_Group_DW', 'New_Product_Group_DW', 'Classification', 'Classification_Prob'],
        check_date=1)

    st.write('Famílias das referências selecionadas alteradas com sucesso.')  # "Families of the selected references changed successfully."
    return
def deployment(df):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', options_file.project_id)

    # Convert NaN/NaT to None so missing values are stored as NULL.
    df = df.astype(object).where(pd.notnull(df), None)

    sql_inject(
        df, options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['final_table'],
        options_file,
        ['Request_Num', 'StemmedDescription', 'Description', 'Language', 'Open_Date', 'Label', 'Classification_Flag'],
        truncate=1)

    log_record('Fim Secção E.', options_file.project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
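# A quick illustration (toy values, real column names from the table above) of the NaN-to-None
# conversion: pandas missing values (NaN/NaT) are replaced by None, which the database layer
# can then store as NULL.
def _example_nan_to_none():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'Request_Num': [1, 2],
                       'Open_Date': [pd.Timestamp('2020-01-01'), pd.NaT],
                       'Label': ['ok', np.nan]})
    df = df.astype(object).where(pd.notnull(df), None)

    assert df.iloc[1]['Open_Date'] is None and df.iloc[1]['Label'] is None
    return df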
def model_performance_saving(df, options_file):
    level_1_e_deployment.sql_inject(
        df,
        level_0_performance_report.DSN_MLG_PRD,
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['performance_algorithm_results'],
        options_file,
        list(df),
        check_date=1)
    return
def deployment(df, main_families_cm, other_families_cm):
    sql_upload(main_families_cm, options_file.sql_info['database_final'], options_file.sql_info['matrix_lvl_1'])
    sql_upload(other_families_cm, options_file.sql_info['database_final'], options_file.sql_info['matrix_lvl_2'])

    df.rename(columns={'Client_Id': 'Client_ID', 'Part_Desc_concat': 'Part_Description', 'Average_Cost_avg': 'Part_Cost',
                       'PVP_1_avg': 'Part_PVP', 'prediction': 'Classification', 'Max_Prob': 'Classification_Prob'}, inplace=True)

    df['Classification_Flag'] = 0
    df['Classification_Prob'] = df['Classification_Prob'].round(2)
    df['Part_Cost'] = df['Part_Cost'].round(2)
    df['Part_PVP'] = df['Part_PVP'].round(2)
    df = df.astype({'Part_Ref': 'str', 'Client_ID': 'str', 'Part_Cost': 'str', 'Part_PVP': 'str', 'Classification_Prob': 'str'})
    df['Part_Description'] = df['Part_Description'].fillna("")
    df.dropna(subset=['Classification'], axis=0, inplace=True)

    sql_inject(
        df, options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['parts_classification_table'],
        options_file,
        columns=['Part_Ref', 'Part_Description', 'Part_Cost', 'Part_PVP', 'Client_ID', 'Product_Group_DW', 'Classification', 'Classification_Prob', 'Classification_Flag'],
        truncate=1, check_date=1)
    sql_inject(
        df, options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_BI_GSC'],
        options_file.sql_info['parts_classification_table'],
        options_file,
        columns=['Part_Ref', 'Part_Description', 'Part_Cost', 'Part_PVP', 'Client_ID', 'Product_Group_DW', 'Classification', 'Classification_Prob', 'Classification_Flag'],
        truncate=1, check_date=1)

    sql_sp_run(options_file.DSN_SRV3_PRD, options_file.sql_info['database_BI_GSC'], options_file)
    return
def deployment(df, db, view):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', project_id)

    if df is not None:
        for col in list(df):
            df[col] = df[col].astype(str)
        df['NLR_Code'] = level_2_optionals_baviera_options.nlr_code

        df = column_rename(
            df,
            list(level_2_optionals_baviera_options.column_sql_renaming.keys()),
            list(level_2_optionals_baviera_options.column_sql_renaming.values()))

        if model_training_check:
            sql_delete(
                level_2_optionals_baviera_options.DSN_MLG_PRD, db, view,
                level_2_optionals_baviera_options,
                {'NLR_Code': '{}'.format(level_2_optionals_baviera_options.nlr_code)})
            sql_inject(
                df, level_2_optionals_baviera_options.DSN_MLG_PRD, db, view,
                level_2_optionals_baviera_options,
                level_2_optionals_baviera_options.columns_for_sql,
                check_date=1)
        else:
            sql_delete(
                level_2_optionals_baviera_options.DSN_MLG_PRD, db, view,
                level_2_optionals_baviera_options,
                {'NLR_Code': '{}'.format(level_2_optionals_baviera_options.nlr_code)})
            sql_inject(
                df, level_2_optionals_baviera_options.DSN_MLG_PRD, db, view,
                level_2_optionals_baviera_options,
                level_2_optionals_baviera_options.columns_for_sql_temp,
                check_date=1)

    log_record('Fim Secção E.', project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
def deployment(df, db, view):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', options_file.project_id)

    if df is not None:
        sel_df = df.loc[:, options_file.sql_columns_vhe_fact_bi].copy()

        # Convert NaT/NaN in the date columns to None so they are stored as NULL.
        date_cols = ['NLR_Posting_Date', 'SLR_Document_Date_CHS', 'SLR_Document_Date_RGN', 'Ship_Arrival_Date',
                     'Registration_Request_Date', 'Registration_Date', 'PDB_Start_Order_Date', 'PDB_End_Order_Date']
        for col in date_cols:
            sel_df[col] = sel_df[col].astype(object).where(sel_df[col].notnull(), None)

        sel_df['Fixed_Margin_II'] = sel_df['Fixed_Margin_II'].round(2)
        sel_df = sel_df.where(sel_df.notnull(), None)
        sel_df.rename(columns={'prev_sales_check': 'Previous_Sales_Flag', 'number_prev_sales': 'Previous_Sales_Count'}, inplace=True)

        sql_inject(sel_df, options_file.DSN_SRV3_PRD, db, view, options_file, list(sel_df), truncate=1, check_date=1)

    log_record('Fim Secção E.', options_file.project_id)
    performance_info_append(time.time(), 'Section_E_End')
def solution_saving(df_solution, group_name, group_name_original):
    level_1_e_deployment.sql_truncate(
        options_file.DSN_MLG_PRD, options_file,
        options_file.sql_info['database_final'],
        options_file.sql_info['optimization_solution_table'],
        query=truncate_query.format(options_file.sql_info['optimization_solution_table'], group_name))

    level_1_e_deployment.sql_inject(
        df_solution, options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['optimization_solution_table'],
        options_file,
        list(df_solution[options_file.columns_sql_solver_solution]),
        check_date=1)

    st.write('Sugestão gravada com sucesso - {}'.format(group_name_original))  # "Suggestion saved successfully - {group}"
    return
def deployment(df_solver, df_part_ref_ta, pse_code):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', options_file.project_id)

    df_solver = column_rename(df_solver, list(options_file.column_sql_renaming.keys()), list(options_file.column_sql_renaming.values()))
    df_solver = df_solver.dropna(subset=[options_file.column_sql_renaming['Group']])
    df_solver['Cost'] = pd.to_numeric(df_solver['Cost'], errors='coerce')
    df_solver.dropna(axis=0, subset=['Cost'], inplace=True)

    df_part_ref_ta = column_rename(df_part_ref_ta, ['Group'], [options_file.column_sql_renaming['Group']])

    sql_truncate(options_file.DSN_MLG_DEV, options_file, options_file.sql_info['database_final'], options_file.sql_info['final_table'],
                 query=options_file.truncate_table_query.format(options_file.sql_info['final_table'], pse_code))
    sql_inject(df_solver, options_file.DSN_MLG_DEV, options_file.sql_info['database_final'], options_file.sql_info['final_table'],
               options_file, columns=list(options_file.column_sql_renaming.values()), check_date=1)

    sql_truncate(options_file.DSN_MLG_DEV, options_file, options_file.sql_info['database_final'], options_file.sql_info['ta_table'],
                 query=options_file.truncate_table_query.format(options_file.sql_info['ta_table'], pse_code))
    df_part_ref_ta.dropna(subset=['Part_Ref_Group_Desc'], inplace=True)
    sql_inject(df_part_ref_ta, options_file.DSN_MLG_DEV, options_file.sql_info['database_final'], options_file.sql_info['ta_table'],
               options_file, columns=list(df_part_ref_ta), check_date=1)

    log_record('Fim Secção E.', options_file.project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
def save_classification_rule(df_product_group, text, text_option, sel_family_sel_overwrite, sel_cost_max, max_cost, sel_cost_min, min_cost, sel_pvp_max, max_pvp, sel_pvp_min, min_pvp):
    family_code = family_code_convertion(sel_family_sel_overwrite, df_product_group)
    time_tag, _ = level_1_e_deployment.time_tags(format_date="%Y%m%d")
    # st.write(text, text_option, family_code, sel_cost_max, max_cost, sel_cost_min, min_cost, sel_pvp_max, max_pvp, sel_pvp_min, min_pvp, time_tag)

    df_rules = pd.DataFrame()
    df_rules['Matching_Rule'] = [text_option]
    df_rules['Word'] = text
    df_rules['Product_Group_DW'] = family_code
    df_rules['Sel_Max_Cost'] = sel_cost_max
    df_rules['Max_Cost'] = max_cost
    df_rules['Sel_Min_Cost'] = sel_cost_min
    df_rules['Min_Cost'] = min_cost
    df_rules['Sel_Max_PVP'] = sel_pvp_max
    df_rules['Max_PVP'] = max_pvp
    df_rules['Sel_Min_PVP'] = sel_pvp_min
    df_rules['Min_PVP'] = min_pvp
    df_rules['Date'] = time_tag

    level_1_e_deployment.sql_inject(
        df_rules, options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['parts_classification_rules'],
        options_file,
        columns=list(df_rules))
    return
def model_choice_upload(flag, name, value, options_file):
    df_model_result = pd.DataFrame(columns=['Model_Choice_Flag', 'Chosen_Model', 'Metric', 'Value', 'Message'])
    message = None

    df_model_result['Model_Choice_Flag'] = [flag]
    df_model_result['Project_Id'] = [options_file.project_id]

    if not flag:
        message = 'Nenhum dos modelos treinados atinge os valores mínimos definidos.'  # "None of the trained models reaches the defined minimum values."
        df_model_result['Chosen_Model'] = [0]
        df_model_result['Metric'] = [0]
        df_model_result['Value'] = [0]
    else:
        if flag == 1:
            message = 'Modelo anterior com melhor performance do que o atual.'  # "Previous model performs better than the current one."
        elif flag == 2:
            message = 'Modelo anterior substituído pelo atual.'  # "Previous model replaced by the current one."
        elif flag == 3:
            message = 'Modelo anterior substituído pelo atual, com pequenas variações de performance.'  # "Previous model replaced by the current one, with small performance variations."
        elif flag == 4:
            message = 'Novo modelo com performance igual ao anterior.'  # "New model performs the same as the previous one."

        df_model_result['Chosen_Model'] = [name]
        df_model_result['Metric'] = [options_file.metric]
        df_model_result['Value'] = [value]

    df_model_result['Message'] = [message]

    level_1_e_deployment.sql_inject(
        df_model_result, options_file.DSN_MLG_PRD,
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['model_choices'],
        options_file,
        list(df_model_result),
        check_date=1)
    return message
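# The meaning of each Model_Choice_Flag value lives only in the if/elif chain above; a possible
# sketch (hypothetical names, not part of the module) keeps the same mapping in one lookup table.
_FLAG_MESSAGES = {
    0: 'Nenhum dos modelos treinados atinge os valores mínimos definidos.',
    1: 'Modelo anterior com melhor performance do que o atual.',
    2: 'Modelo anterior substituído pelo atual.',
    3: 'Modelo anterior substituído pelo atual, com pequenas variações de performance.',
    4: 'Novo modelo com performance igual ao anterior.',
}


def _example_flag_message(flag):
    return _FLAG_MESSAGES.get(flag)  # e.g. _example_flag_message(2) -> 'Modelo anterior substituído pelo atual.'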
def performance_evaluation_classification(models, best_models, running_times, datasets, options_file, project_id):
    # models -> list of model names
    # best_models -> dict with model name as key and the best classifier after grid search as value
    # running_times -> dict with model name as key and the training time as value
    # datasets -> dict with the required datasets: train_x, test_x, train_y, test_y

    results_train, results_test = [], []
    predictions, feat_importance = {}, pd.DataFrame(index=list(datasets['train_x']), columns=['Importance'])

    for model in models:
        prediction_train = best_models[model].predict(datasets['train_x'])
        prediction_test = best_models[model].predict(datasets['test_x'])
        evaluation_training = ClassificationEvaluation(groundtruth=datasets['train_y'], prediction=prediction_train)
        evaluation_test = ClassificationEvaluation(groundtruth=datasets['test_y'], prediction=prediction_test)
        predictions[model] = [prediction_train.astype(int, copy=False), prediction_test.astype(int, copy=False)]

        # plot_conf_matrix(datasets['train_y'], prediction_train, classes, model, project_id)
        # plot_conf_matrix(datasets['test_y'], prediction_test, classes, model, project_id)

        try:
            feat_importance['Importance'] = best_models[model].feature_importances_
            feat_importance.sort_values(by='Importance', ascending=False, inplace=True)
            feat_importance.to_csv(base_path + '/output/' + 'feature_importance_' + str(model) + '.csv')
        except AttributeError:
            pass

        row_train = {
            'Micro_F1': getattr(evaluation_training, 'micro'),
            'Average_F1': getattr(evaluation_training, 'average'),
            'Macro_F1': getattr(evaluation_training, 'macro'),
            'Accuracy': getattr(evaluation_training, 'accuracy'),
            'ROC_Curve': getattr(evaluation_training, 'roc_auc_curve'),
            ('Precision_Class_' + str(best_models[model].classes_[0])): getattr(evaluation_training, 'precision')[0],
            ('Precision_Class_' + str(best_models[model].classes_[1])): getattr(evaluation_training, 'precision')[1],
            ('Recall_Class_' + str(best_models[model].classes_[0])): getattr(evaluation_training, 'recall')[0],
            ('Recall_Class_' + str(best_models[model].classes_[1])): getattr(evaluation_training, 'recall')[1],
            'Running_Time': running_times[model]
        }

        row_test = {
            'Micro_F1': getattr(evaluation_test, 'micro'),
            'Average_F1': getattr(evaluation_test, 'average'),
            'Macro_F1': getattr(evaluation_test, 'macro'),
            'Accuracy': getattr(evaluation_test, 'accuracy'),
            'ROC_Curve': getattr(evaluation_test, 'roc_auc_curve'),
            ('Precision_Class_' + str(best_models[model].classes_[0])): getattr(evaluation_test, 'precision')[0],
            ('Precision_Class_' + str(best_models[model].classes_[1])): getattr(evaluation_test, 'precision')[1],
            ('Recall_Class_' + str(best_models[model].classes_[0])): getattr(evaluation_test, 'recall')[0],
            ('Recall_Class_' + str(best_models[model].classes_[1])): getattr(evaluation_test, 'recall')[1],
            'Running_Time': running_times[model]
        }

        results_train.append(row_train)
        results_test.append(row_test)

    df_results_train = pd.DataFrame(results_train, index=models)
    df_results_train['Algorithms'] = df_results_train.index
    df_results_train['Dataset'] = ['Train'] * df_results_train.shape[0]
    df_results_train['Project_Id'] = [project_id] * df_results_train.shape[0]

    df_results_test = pd.DataFrame(results_test, index=models)
    df_results_test['Algorithms'] = df_results_test.index
    df_results_test['Dataset'] = ['Test'] * df_results_test.shape[0]
    df_results_test['Project_Id'] = [project_id] * df_results_test.shape[0]

    metric_bar_plot(df_results_train, 'project_{}_train_dataset'.format(project_id))
    metric_bar_plot(df_results_test, 'project_{}_test_dataset'.format(project_id))

    model_performance_saving(pd.concat([df_results_train, df_results_test]), options_file)

    level_1_e_deployment.sql_inject(
        pd.concat([df_results_train, df_results_test]),
        level_0_performance_report.performance_sql_info['DSN'],
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['performance_algorithm_results'],
        options_file,
        list(df_results_train),
        check_date=1)

    return df_results_train, df_results_test, predictions
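# For reference, a self-contained sketch of the kind of values collected in row_train/row_test,
# computed directly with scikit-learn on toy binary labels. The project's ClassificationEvaluation
# class is not shown in this section, so treating 'Average_F1' as the weighted F1 is an assumption.
def _example_classification_metrics():
    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

    y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
    y_pred = np.array([0, 1, 1, 1, 0, 0, 1, 0])

    precision = precision_score(y_true, y_pred, average=None)  # one value per class
    recall = recall_score(y_true, y_pred, average=None)

    return {
        'Micro_F1': f1_score(y_true, y_pred, average='micro'),
        'Average_F1': f1_score(y_true, y_pred, average='weighted'),
        'Macro_F1': f1_score(y_true, y_pred, average='macro'),
        'Accuracy': accuracy_score(y_true, y_pred),
        'ROC_Curve': roc_auc_score(y_true, y_pred),
        'Precision_Class_0': precision[0], 'Precision_Class_1': precision[1],
        'Recall_Class_0': recall[0], 'Recall_Class_1': recall[1],
    }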
def feature_contribution(df, configuration_parameters, col_to_group_by, options_file, project_id):
    configuration_parameters.remove(col_to_group_by)

    boolean_parameters = [x for x in configuration_parameters if list(df[x].unique()) == [0, 1] or list(df[x].unique()) == [1, 0]]
    non_boolean_parameters = [x for x in configuration_parameters if x not in boolean_parameters]
    df_feature_contribution_total = pd.DataFrame()

    for model in df[col_to_group_by].unique():
        model_mask = df[col_to_group_by] == model
        df_model = df.loc[df[model_mask].index, :]

        mask_class_1 = df_model['score_class_gt'] == 1
        mask_class_0 = df_model['score_class_gt'] == 0
        class_1 = df_model.loc[df_model[mask_class_1].index, :]
        class_0 = df_model.loc[df_model[mask_class_0].index, :]

        differences_boolean, differences_non_boolean, features_boolean, features_non_boolean = [], [], [], []
        differences_feature, features, model_tag = [], [], []

        for feature in configuration_parameters:
            if feature in boolean_parameters:
                c1_f1 = class_1.loc[class_1[feature] == 1, :].shape[0]
                c1_f0 = class_1.loc[class_1[feature] == 0, :].shape[0]
                c0_f1 = class_0.loc[class_0[feature] == 1, :].shape[0]
                c0_f0 = class_0.loc[class_0[feature] == 0, :].shape[0]

                f1 = c1_f1 + c0_f1
                f0 = c1_f0 + c0_f0

                try:
                    p_c1_f1 = c1_f1 / f1 * 1.
                    p_c1_f0 = c1_f0 / f0 * 1.
                    differences_boolean.append(p_c1_f1 - p_c1_f0)
                    features_boolean.append(feature + '_sim')
                except ZeroDivisionError:
                    continue

            elif feature in non_boolean_parameters:
                for value in df_model[feature].unique():
                    if value == 'outros':
                        continue

                    c1_f1 = class_1.loc[class_1[feature] == value, :].shape[0]
                    c1_f0 = class_1.loc[class_1[feature] != value, :].shape[0]
                    c0_f1 = class_0.loc[class_0[feature] == value, :].shape[0]
                    c0_f0 = class_0.loc[class_0[feature] != value, :].shape[0]
                    # ToDo: There might be cases where only one value for a feature is available if the df is too small (only Preto as Cor_Interior, e.g.). A try/except should be added to catch these for the conditions where feature != value.

                    f1 = c1_f1 + c0_f1
                    f0 = c1_f0 + c0_f0

                    try:
                        p_c1_f1 = c1_f1 / f1 * 1.
                        p_c1_f0 = c1_f0 / f0 * 1.
                    except ZeroDivisionError:
                        level_0_performance_report.log_record('Dados insuficientes para a feature {} com valor {}.'.format(feature, value), project_id, flag=1)  # "Insufficient data for feature {} with value {}."
                        continue

                    differences_non_boolean.append(p_c1_f1 - p_c1_f0)
                    features_non_boolean.append(feature + '_' + value)

        differences_feature.extend(differences_boolean)
        differences_feature.extend(differences_non_boolean)
        features.extend(features_boolean)
        features.extend(features_non_boolean)
        model_tag.extend([model] * (len(differences_boolean) + len(differences_non_boolean)))

        df_feature_contribution = pd.DataFrame()
        df_feature_contribution['Features'] = features
        df_feature_contribution['Differences'] = differences_feature
        df_feature_contribution['Model_Code'] = model_tag

        # Normalize the differences to a symmetric range centered on zero.
        if abs(df_feature_contribution['Differences'].min()) > df_feature_contribution['Differences'].max():
            max_range_value = abs(df_feature_contribution['Differences'].min())
            min_range_value = df_feature_contribution['Differences'].min()
        else:
            max_range_value = df_feature_contribution['Differences'].max()
            min_range_value = df_feature_contribution['Differences'].max() * -1

        df_feature_contribution['Differences_Normalized'] = 2 * df_feature_contribution['Differences'] / (max_range_value - min_range_value)
        df_feature_contribution_total = pd.concat([df_feature_contribution_total, df_feature_contribution])

    level_1_e_deployment.sql_inject(
        df_feature_contribution_total, options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['feature_contribution'],
        options_file,
        list(df_feature_contribution_total),
        truncate=1)
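# A small worked example of the quantity computed per feature above: for a boolean feature the
# contribution is P(class 1 | feature present) - P(class 1 | feature absent). The column names
# and counts below are made up for illustration.
def _example_feature_contribution():
    import pandas as pd

    df_model = pd.DataFrame({'feature_flag': [1, 1, 1, 0, 0, 0, 1, 0],
                             'score_class_gt': [1, 1, 0, 0, 1, 0, 1, 0]})
    class_1 = df_model[df_model['score_class_gt'] == 1]
    class_0 = df_model[df_model['score_class_gt'] == 0]

    c1_f1 = (class_1['feature_flag'] == 1).sum()   # class 1 rows with the feature present
    c0_f1 = (class_0['feature_flag'] == 1).sum()   # class 0 rows with the feature present
    c1_f0 = (class_1['feature_flag'] == 0).sum()   # class 1 rows with the feature absent
    c0_f0 = (class_0['feature_flag'] == 0).sum()   # class 0 rows with the feature absent

    p_c1_f1 = c1_f1 / (c1_f1 + c0_f1)   # P(class 1 | feature present) = 3/4
    p_c1_f0 = c1_f0 / (c1_f0 + c0_f0)   # P(class 1 | feature absent)  = 1/4
    return p_c1_f1 - p_c1_f0            # 0.5: the feature is associated with class 1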
def performance_evaluation_regression(models, best_models, running_times, datasets, datasets_non_ohe, options_file, project_id):
    results_train, results_test = [], []
    predictions, feat_importance = {}, pd.DataFrame(index=list(datasets['train_x']), columns=['Importance'])

    for model in models:
        # 'lgb' uses the non-one-hot-encoded datasets.
        if model == 'lgb':
            train_x, test_x = datasets_non_ohe['train_x'], datasets_non_ohe['test_x']
            train_y, test_y = datasets_non_ohe['train_y'], datasets_non_ohe['test_y']
        else:
            train_x, test_x = datasets['train_x'], datasets['test_x']
            train_y, test_y = datasets['train_y'], datasets['test_y']

        prediction_train = best_models[model].predict(train_x)
        prediction_test = best_models[model].predict(test_x)
        evaluation_training = RegressionEvaluation(groundtruth=train_y, prediction=prediction_train)
        evaluation_test = RegressionEvaluation(groundtruth=test_y, prediction=prediction_test)
        predictions[model] = [prediction_train.astype(int, copy=False), prediction_test.astype(int, copy=False)]

        # try:
        #     feat_importance['Importance'] = best_models[model].feature_importances_
        #     feat_importance.sort_values(by='Importance', ascending=False, inplace=True)
        #     feat_importance.to_csv(base_path + '/output/' + 'feature_importance_' + str(model) + '.csv')
        # except AttributeError:
        #     pass

        row_train = {
            'R2': getattr(evaluation_training, 'r2_score'),
            'MSE': getattr(evaluation_training, 'mse'),
            'RMSE': np.sqrt(getattr(evaluation_training, 'mse')),
            'Running_Time': running_times[model]
        }
        row_test = {
            'R2': getattr(evaluation_test, 'r2_score'),
            'MSE': getattr(evaluation_test, 'mse'),
            'RMSE': np.sqrt(getattr(evaluation_test, 'mse')),
            'Running_Time': running_times[model]
        }

        results_train.append(row_train)
        results_test.append(row_test)

    df_results_train = pd.DataFrame(results_train, index=models)
    df_results_train['Algorithms'] = df_results_train.index
    df_results_train['Dataset'] = ['Train'] * df_results_train.shape[0]
    df_results_train['Project_Id'] = [project_id] * df_results_train.shape[0]

    df_results_test = pd.DataFrame(results_test, index=models)
    df_results_test['Algorithms'] = df_results_test.index
    df_results_test['Dataset'] = ['Test'] * df_results_test.shape[0]
    df_results_test['Project_Id'] = [project_id] * df_results_test.shape[0]

    # metric_bar_plot(df_results_train, 'project_{}_train_dataset'.format(project_id))
    # metric_bar_plot(df_results_test, 'project_{}_test_dataset'.format(project_id))

    level_1_e_deployment.sql_inject(
        pd.concat([df_results_train, df_results_test]),
        level_0_performance_report.performance_sql_info['DSN'],
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['performance_algorithm_results'],
        options_file,
        list(df_results_train),
        check_date=1)

    return df_results_train, df_results_test, predictions
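# A minimal sketch of the regression metrics gathered above, computed directly with scikit-learn
# on toy values; the project's RegressionEvaluation class is assumed to expose equivalent numbers.
def _example_regression_metrics():
    import numpy as np
    from sklearn.metrics import mean_squared_error, r2_score

    y_true = np.array([10.0, 12.5, 9.0, 15.0])
    y_pred = np.array([11.0, 12.0, 8.5, 14.0])

    mse = mean_squared_error(y_true, y_pred)
    return {'R2': r2_score(y_true, y_pred), 'MSE': mse, 'RMSE': np.sqrt(mse)}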