def initialize_app():
    """
    Initializes our Flask application.
    - creates a Flask app object
    - sets AWS keys for uploading payloads to S3
    - retrieves and sets the application config
    - integrates with Sentry for error reporting
    - sets up a background scheduler to refresh the config every 3,600 seconds
    - loads the trained model and sets it as a global object
    """
    app = Flask(__name__)
    if ENVIRONMENT != 'local':
        sentry_sdk.init(dsn=SENTRY_DSN, integrations=[FlaskIntegration()],
                        traces_sample_rate=1.0)
    config_dict = retrieve_app_config(DB_SCHEMA, make_mysql_connection(DATABASE_SECRET),
                                      ENVIRONMENT)
    for key, value in config_dict.items():
        app.config[key] = value
    scheduler = BackgroundScheduler()
    scheduler.add_job(func=hit_config_refresh_endpoint, trigger="interval",
                      seconds=3_600)
    scheduler.start()
    global model
    model = joblib.load(MODEL_PATH)
    return app
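# The background scheduler above calls hit_config_refresh_endpoint, which is not shown
# in this section. A minimal sketch of what it might look like, assuming the
# config_refresh view below is registered at a hypothetical /config-refresh route and
# the app listens on localhost:5000; the actual URL and error handling may differ.
import requests


def hit_config_refresh_endpoint():
    """Pings the running app so it re-reads its config from MySQL."""
    try:
        response = requests.get('http://localhost:5000/config-refresh', timeout=10)
        print(f'config refresh returned status {response.status_code}')
    except requests.RequestException as error:
        # the scheduler keeps running even if a single refresh attempt fails
        print(f'config refresh failed: {error}')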
def predict():
    """
    Endpoint to produce model predictions. Output is logged to S3.
    """
    try:
        session["endpoint"] = "predict"
        response_start_time = time.time()
        input_data = request.json
        ltv_df = get_client_ltv_table(make_mysql_connection(DATABASE_SECRET))
        client_id = input_data.get("client_id", "000000")
        try:
            client_ltv = (ltv_df.loc[ltv_df["client_id"] == client_id])["ltv"].iloc[-1]
        except IndexError:
            client_ltv = 0
        input_df = convert_json_to_dataframe(input_data)
        prediction = make_prediction(input_df, model)
        if prediction >= float(app.config.get("proba_cutoff", 0.75)):
            high_risk = "yes"
        else:
            high_risk = "no"
        processing_time = round(time.time() - response_start_time, 3)
        input_data["uid"] = session.get("uid")
        input_data["url"] = request.url
        input_data["endpoint"] = "predict"
        output = dict()
        output["prediction"] = prediction
        output["high_risk"] = high_risk
        output["response_time"] = processing_time
        output["ltv"] = client_ltv
        session["output"] = deepcopy(output)
        session["input"] = input_data
        print(output)
        return output
    except Exception as exception:
        print(exception)
        sentry_sdk.capture_exception(exception)
        output = {"error": "app was not able to process request", "prediction": 0}
        return output
    finally:
        if ENVIRONMENT != "local":
            uid = session.get("uid")
            input_payload = session.get("input")
            output_payload = session.get("output", {})
            output_payload["logging_timestamp"] = str(get_current_timestamp())
            output_payload["logging_epoch"] = time.time()
            log_payload_to_s3(input_payload, output_payload, uid,
                              OUTPUT_LOGS_S3_BUCKET_NAME)
            log_payloads_to_mysql(input_payload, output_payload, OUTPUT_LOGS_TABLE_NAME,
                                  DB_SCHEMA, DATABASE_SECRET)
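# A quick way to exercise the predict endpoint. This is a sketch: it assumes the view
# is registered at a hypothetical /predict route on localhost:5000 and that the JSON
# body carries client_id plus whatever feature fields convert_json_to_dataframe and the
# trained model expect; the placeholder values below are illustrative only.
import requests

payload = {
    'client_id': 'example_client_id',
    # model feature fields go here
}
response = requests.post('http://localhost:5000/predict', json=payload, timeout=30)
# expected keys per the endpoint above: prediction, high_risk, response_time, ltv
print(response.json())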
def main():
    """
    Loads the csv churn data into a MySQL table.
    """
    db_conn = make_mysql_connection('churn-model-mysql')
    df = pd.read_csv('data/site_churn_data.csv')
    dynamically_create_ddl_and_execute(df, 'churn_model', 'churn_data', db_conn)
    write_dataframe_to_database(df, 'churn_model', 'churn_data', db_conn)
    sleep(2)
    validation_df = pd.read_sql('''select * from churn_model.churn_data;''', db_conn)
    print(validation_df.head())
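# dynamically_create_ddl_and_execute is referenced above but not shown in this section.
# A minimal sketch of the idea, assuming make_mysql_connection returns a SQLAlchemy
# engine and that mapping every pandas dtype to either FLOAT or TEXT is acceptable; the
# real implementation may use a richer type mapping.
import pandas as pd
from sqlalchemy import text


def dynamically_create_ddl_and_execute(df, schema_name, table_name, db_conn):
    """Builds a CREATE TABLE statement from a dataframe's dtypes and executes it."""
    column_clauses = []
    for column, dtype in df.dtypes.items():
        sql_type = 'FLOAT' if pd.api.types.is_numeric_dtype(dtype) else 'TEXT'
        column_clauses.append(f'`{column}` {sql_type}')
    ddl = (f'CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} '
           f'({", ".join(column_clauses)});')
    with db_conn.begin() as connection:
        connection.execute(text(ddl))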
def config_refresh():
    """
    Endpoint to refresh the config. This invokes the retrieve_app_config function to
    query the relevant MySQL table with configuration values.
    """
    config_dict = retrieve_app_config(DB_SCHEMA, make_mysql_connection(DATABASE_SECRET),
                                      ENVIRONMENT)
    for key, value in config_dict.items():
        app.config[key] = value
    return "config refresh hit"
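# retrieve_app_config is used both at startup and in the refresh endpoint but is not
# shown in this section. A sketch under assumed names (a hypothetical app_config table
# with config_key, config_value, and environment columns); the real query and schema
# may differ.
import pandas as pd


def retrieve_app_config(schema_name, db_conn, environment):
    """Returns the config for the given environment as a dict of key-value pairs."""
    query = f'''
    select config_key, config_value
    from {schema_name}.app_config
    where environment = '{environment}';
    '''
    config_df = pd.read_sql(query, db_conn)
    return dict(zip(config_df['config_key'], config_df['config_value']))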
def get_data_to_explore():
    """
    Tightly-coupled function to retrieve the data we want to explore.
    """
    df = pd.read_sql('''select * from churn_model.churn_data;''',
                     make_mysql_connection('churn-model-mysql'))
    df['churn'] = np.where(df['churn'].str.startswith('y'), 1, 0)
    df.drop(['id', 'meta__inserted_at', 'client_id', 'acquired_date'], axis=1,
            inplace=True)
    return df
def query_logs_table(db_secret_name, start_timestamp):
    """
    Queries the table of API logs.

    :param db_secret_name: name of the Secrets Manager secret with the database
    credentials
    :param start_timestamp: timestamp from which to start pulling logs
    :returns: pandas dataframe
    """
    query = f'''
    select JSON_EXTRACT(input_output_payloads, "$.output.prediction") as prediction
    from churn_model.model_logs
    where logging_timestamp >= '{start_timestamp}';
    '''
    df = pd.read_sql(query, make_mysql_connection(db_secret_name))
    return df
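# Example usage: pull logged predictions since a given timestamp and summarize them.
# This assumes the same 'churn-model-mysql' secret used elsewhere in the project;
# depending on the driver, JSON_EXTRACT values may come back as strings, hence the
# float cast.
logs_df = query_logs_table('churn-model-mysql', '2021-01-01 00:00:00')
logs_df['prediction'] = logs_df['prediction'].astype(float)
print(logs_df['prediction'].describe())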
def login():
    """
    Login endpoint for the model user interface.
    """
    if request.method == 'POST':
        form_submission = request.form
        username = str(form_submission['username'])
        password = str(form_submission['password'])
        hashed_password = sha256(password.encode('utf-8')).hexdigest()
        database_password = get_hashed_password_for_username(
            username, make_mysql_connection(DATABASE_SECRET))
        if hashed_password == database_password:
            session['logged_in'] = True
            return redirect(url_for('model_interface'))
        else:
            flash('Credentials are not valid. Please try again.')
    return render_template('login.html')
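# get_hashed_password_for_username is not shown in this section. A sketch assuming a
# hypothetical churn_model.app_users table with username and hashed_password columns,
# using the same f-string query style as the rest of the project; the real table and
# query may differ.
import pandas as pd


def get_hashed_password_for_username(username, db_conn):
    """Returns the stored SHA-256 password hash for a username, or None if not found."""
    query = f'''
    select hashed_password
    from churn_model.app_users
    where username = '{username}';
    '''
    df = pd.read_sql(query, db_conn)
    if len(df) > 0:
        return df['hashed_password'].iloc[0]
    return None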
def model_interface():
    """
    Model user interface to render predictions in HTML.
    """
    logged_in = session.get('logged_in', False)
    if logged_in:
        if request.method == 'POST':
            form_submission = request.form
            raw_clients = str(form_submission['clients'])
            client_list = raw_clients.split(',')
            client_list = [str(c) for c in client_list]
            model_df = get_training_data(make_mysql_connection(DATABASE_SECRET))
            model_df = model_df.loc[model_df['client_id'].isin(client_list)]
            if len(model_df) > 0:
                model_df.reset_index(inplace=True, drop=True)
                predictions_df = pd.DataFrame(model.predict_proba(model_df)[:, 1],
                                              columns=['prediction'])
                predictions_df = pd.concat([model_df[['client_id']], predictions_df],
                                           axis=1)
                client_df = pd.DataFrame({
                    'client_id': client_list,
                    'prediction': ['client_id_not_found'] * len(client_list)
                })
                predictions_df = pd.concat([predictions_df, client_df], axis=0)
                predictions_df['client_id'] = predictions_df['client_id'].astype(str)
                predictions_df['client_id'] = predictions_df['client_id'].str.strip()
                predictions_df = predictions_df.drop_duplicates(subset=['client_id'],
                                                                keep='first')
                return render_template('model_interface.html',
                                       predictions=predictions_df.to_html(header=True,
                                                                          index=False))
            else:
                return render_template(
                    'model_interface.html',
                    predictions='None of the passed Client Ids could be found.')
        else:
            return render_template('model_interface.html',
                                   predictions='predictions will be rendered here')
    return redirect(url_for('login'))
def main(target, test_set_percentage, model_training_list, cv_strategy, cv_scoring,
         static_param_space, class_cutoff, evaluation_list, calibration_bins,
         drop_col_scorer, drop_col_scorer_string, drop_col_scoring_type,
         drop_col_higher_is_better, explanation_sample_n, use_shap_kernel,
         s3_logging_bucket, db_schema_name, log_to_db, db_secret_name):
    """
    Main execution function.

    :param target: name of the target
    :param test_set_percentage: percentage of observations for the test set
    :param model_training_list: list of named tuples containing model configurations;
    the following tuple elements are required: model_name, model, param_space,
    iterations
    :param cv_strategy: cross validation strategy
    :param cv_scoring: scoring strategy for cross validation
    :param static_param_space: param space valid for every model
    :param class_cutoff: probability percentage to be classified in the positive class
    :param evaluation_list: list of named tuples containing model evaluation
    configurations; the following tuple elements are required: evaluation_column,
    scorer_callable, metric_name
    :param calibration_bins: list of calibration bins to show
    :param drop_col_scorer: scikit-learn scoring function for the drop-column model
    :param drop_col_scorer_string: scoring metric in the form of a string
    (e.g. 'neg_log_loss') for the drop-column model
    :param drop_col_scoring_type: either class or probability for the drop-column model
    :param drop_col_higher_is_better: Boolean of whether or not a higher score is
    better (e.g. roc auc vs. log loss) for the drop-column model
    :param explanation_sample_n: number of observations to include when performing
    feature explanation
    :param use_shap_kernel: Boolean of whether or not to use the SHAP kernel explainer
    :param s3_logging_bucket: S3 bucket in which to store the model output
    :param db_schema_name: name of the schema for logging model results
    :param log_to_db: Boolean of whether or not to log results to the database
    :param db_secret_name: Secrets Manager secret with database credentials
    """
    db_conn = make_mysql_connection(db_secret_name)
    x_train, x_test, y_train, y_test = create_training_and_testing_data(
        target, test_set_percentage, db_conn)
    train_and_evaluate_model(
        x_train, x_test, y_train, y_test, model_training_list, cv_strategy, cv_scoring,
        static_param_space, class_cutoff, target, evaluation_list, calibration_bins,
        drop_col_scorer, drop_col_scorer_string, drop_col_scoring_type,
        drop_col_higher_is_better, explanation_sample_n, use_shap_kernel,
        s3_logging_bucket, db_schema_name, db_conn, log_to_db)
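# The docstring above requires specific named-tuple fields for model_training_list and
# evaluation_list. A sketch of how those arguments might be assembled; the tuple type
# names, models, parameter spaces, and evaluation_column values here are illustrative,
# not the project's actual configuration.
from collections import namedtuple

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score

ModelTrainingConfig = namedtuple('ModelTrainingConfig',
                                 ['model_name', 'model', 'param_space', 'iterations'])
EvaluationConfig = namedtuple('EvaluationConfig',
                              ['evaluation_column', 'scorer_callable', 'metric_name'])

model_training_list = [
    ModelTrainingConfig(model_name='logistic_regression',
                        model=LogisticRegression(),
                        param_space={'model__C': [0.1, 1, 10]},
                        iterations=3),
    ModelTrainingConfig(model_name='random_forest',
                        model=RandomForestClassifier(),
                        param_space={'model__max_depth': [3, 5, 10]},
                        iterations=3),
]

evaluation_list = [
    EvaluationConfig(evaluation_column='predicted_proba',
                     scorer_callable=log_loss,
                     metric_name='log_loss'),
    EvaluationConfig(evaluation_column='predicted_proba',
                     scorer_callable=roc_auc_score,
                     metric_name='roc_auc'),
]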
def main(model_path, db_secret_name, p_value_cutoff, model_features):
    """
    Determines if data shift has occurred between training and production feature
    distributions.

    :param model_path: path to the model
    :param db_secret_name: Secrets Manager secret with DB credentials
    :param p_value_cutoff: p-value for chi-squared calculation
    :param model_features: features used for modeling
    """
    db_conn = make_mysql_connection(db_secret_name)
    model_uid = extract_model_uid_from_path(model_path)
    query_start_time = get_query_start_timestamp(model_uid, db_conn)
    production_df = extract_production_data(query_start_time, model_uid, db_conn)
    original_training_df = recreate_data_used_for_training(model_uid, model_features)
    cat_production_df = production_df.select_dtypes(include='object')
    num_production_df = production_df.select_dtypes(exclude='object')
    cat_training_df = original_training_df.select_dtypes(include='object')
    num_training_df = original_training_df.select_dtypes(exclude='object')
    cat_columns = set(list(cat_production_df) + list(cat_training_df))
    num_columns = set(list(num_production_df) + list(num_training_df))
    drift_results = []
    for cat_col in cat_columns:
        temp_chi_squared_df = prep_category_for_chi_squared(cat_training_df,
                                                            cat_production_df, cat_col)
        p_value = calculate_chi_squared_statistic(temp_chi_squared_df['train_count'],
                                                  temp_chi_squared_df['prod_count'])
        drift_results.append(pd.DataFrame({'feature': [cat_col], 'p_value': [p_value]}))
    for num_col in num_columns:
        p_value = calculate_ks_statistic(num_training_df[num_col],
                                         num_production_df[num_col])
        drift_results.append(pd.DataFrame({'feature': [num_col], 'p_value': [p_value]}))
    # DataFrame.append was removed in pandas 2.0, so collect results and concat once
    main_drift_df = pd.concat(drift_results, ignore_index=True)
    main_drift_df['shift_occurred'] = np.where(main_drift_df['p_value'] <= p_value_cutoff,
                                               True, False)
    main_drift_df['p_value_cutoff'] = p_value_cutoff
    db.write_dataframe_to_database(main_drift_df, 'churn_model', 'data_shift', db_conn)
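# The two statistical helpers above are not shown in this section. Minimal sketches of
# what they might look like with scipy, assuming both return only the p-value; the real
# implementations may differ.
from scipy import stats


def calculate_chi_squared_statistic(train_counts, prod_counts):
    """Runs a chi-squared test on categorical level counts and returns the p-value."""
    _, p_value, _, _ = stats.chi2_contingency([train_counts, prod_counts])
    return p_value


def calculate_ks_statistic(train_series, prod_series):
    """Runs a two-sample Kolmogorov-Smirnov test and returns the p-value."""
    _, p_value = stats.ks_2samp(train_series.dropna(), prod_series.dropna())
    return p_value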