import data_preprocessing import feature_engineering import eda_monitoring import modeling import performance_monitoring from prefect import Flow, task, context import pandas as pd # Pandas options for better shell display pd.set_option("display.max_rows", 100) pd.set_option("display.max_columns", None) pd.set_option("display.width", None) start_time = greenhouse_clock.get_time() @task def sourcing(): return data_sourcing.get() @task def cleansing(df): return data_preprocessing.clean(df) @task
import numpy as np from sklearn import metrics import json import greenhouse_clock meta = {} # Timestamp for files meta["timestr"] = greenhouse_clock.get_time() def optimal_threshold(y_true, y_score): # Performance extracted from the "ROC curve" fpr, tpr, thr = metrics.roc_curve( y_true=y_true, y_score=y_score, pos_label=1, drop_intermediate=False ) diff = np.abs(tpr - fpr) # Numpy index of the maximum separation between TPR and FPR diff_idx = np.argmax(diff) # Optimum threshold based on max diff criterium return thr[diff_idx] def report_performance( y_true, y_score, best_hyperparams, path, opt_thr=0.5, suffix="_" ):