def shap_explain_xgb(data_set_path, plot_n=None):
    """Explain a previously saved XGBoost model with SHAP and save the plots.

    The pickled model is looked up next to the data set, at the data set path
    with ``.csv`` replaced by ``_xgb_model.pkl``. A SHAP summary plot is always
    written (``_shap_summary_xgb.png``); one waterfall plot per entry of
    ``plot_n`` is written as ``_shap_water_xgb_<n>.png``.

    Args:
        data_set_path: Path of the data set CSV; all output paths are derived
            from it.
        plot_n: Optional iterable of row indices into the SHAP values for
            which an individual waterfall plot is saved. ``None`` skips them.

    Raises:
        AssertionError: If the pickled model file does not exist.
    """
    pickle_path = data_set_path.replace('.csv', '_xgb_model.pkl')
    # Raised explicitly rather than via `assert`: assert statements are removed
    # under `python -O`, which would let a missing model surface later as a
    # less helpful error from open(). The exception type is kept for callers.
    if not os.path.isfile(pickle_path):
        raise AssertionError(
            'You must run listing 9.6 to save an XGB regression model first')
    # NOTE: unpickling executes arbitrary code; only load model files that
    # were produced locally by the training listing.
    with open(pickle_path, 'rb') as fid:
        xgb_model = pickle.load(fid)

    # NOTE(review): reload_churn_data is defined elsewhere; presumably it
    # returns the current-customer feature frame - confirm.
    current_df = reload_churn_data(data_set_path, 'current', '8.3',
                                   is_customer_data=True)
    explainer = shap.TreeExplainer(xgb_model,
                                   feature_perturbation="interventional",
                                   model_output='probability',
                                   data=current_df)
    shap_values = explainer(current_df)

    # Summary plot over all rows.
    shap.summary_plot(shap_values, current_df, show=False)
    save_file = data_set_path.replace('.csv', '_shap_summary_xgb.png')
    print(f'Saving SHAP Explanation to {save_file}')
    plt.tight_layout()
    plt.savefig(save_file, format='png')
    plt.close()

    # Optional per-row waterfall plots.
    if plot_n is not None:
        for n in plot_n:
            shap.waterfall_plot(shap_values[n], show=False)
            save_file = data_set_path.replace('.csv',
                                              f'_shap_water_xgb_{n}.png')
            plt.tight_layout()
            plt.savefig(save_file, format='png')
            plt.close()
def shapley_tree(model_predict, obs, dataset, column_names, plot_draw=False):
    """Compute KernelExplainer SHAP values for ``obs``.

    The explainer background is a 100-row sample of ``dataset`` taken with
    ``shap.sample``. When ``plot_draw`` is true a waterfall plot labelled with
    ``column_names`` is drawn as well.

    Returns:
        A ``(shap_values, expected_value)`` tuple.
    """
    background = shap.sample(dataset, 100)
    kernel_explainer = shap.KernelExplainer(model_predict, background)
    contributions = kernel_explainer.shap_values(obs)

    if plot_draw:
        shap.waterfall_plot(kernel_explainer.expected_value,
                            contributions,
                            feature_names=column_names)

    return contributions, kernel_explainer.expected_value
def shapley_diff(model, obs, dataset, column_names, treatment_col, plot_draw=True):
    """Return the treatment-1 minus treatment-0 difference of SHAP explanations.

    ``shapley_tree`` is run twice, once on the predictor built by
    ``predict_treatment`` for treatment value 0 and once for value 1 (both
    without drawing). When ``plot_draw`` is true the difference is drawn as a
    waterfall plot.

    Returns:
        A ``(shap_value_difference, expected_value_difference)`` tuple.
    """
    control_predict = predict_treatment(model, treatment_col, 0)
    shap_t0, exp0 = shapley_tree(control_predict, obs, dataset, column_names)

    treated_predict = predict_treatment(model, treatment_col, 1)
    shap_t1, exp1 = shapley_tree(treated_predict, obs, dataset, column_names)

    shap_delta = shap_t1 - shap_t0
    baseline_delta = exp1 - exp0

    if plot_draw:
        shap.waterfall_plot(baseline_delta, shap_delta,
                            feature_names=column_names)

    return shap_delta, baseline_delta
def waterfall_plot(self, row_idx=None, class_id=0, **kwargs):
    "Plot the explanation of a single prediction as a waterfall plot (random row when `row_idx` is None)"
    values, baseline = _get_values(self, class_id)
    n_rows = values.shape[0]
    if row_idx is None:
        # No row requested: pick one uniformly at random.
        row_idx = random.randint(0, n_rows-1)
    print(f'Displaying row {row_idx} of {n_rows} (use `row_idx` to specify another row)')
    column_labels = self.test_data.columns
    row_values = values[row_idx,:]
    return shap.waterfall_plot(baseline, row_values, feature_names=column_labels, **kwargs)
def waterfall_plot(self, row_index=None, class_id=0, **kwargs):
    """
    Plot the explanation of a single prediction as a waterfall plot.

    `row_index` is the index of the row of `test_data` to analyze; when it is
    None a row is drawn at random.
    `class_id` selects the class of interest for classification models; it
    is forwarded to `_get_values` and can be an int or a string.
    Any extra keyword arguments are passed straight to `shap.waterfall_plot`.

    For an up-to-date list of the parameters, see:
    https://github.com/slundberg/shap/blob/master/shap/plots/waterfall.py
    """
    values, baseline = _get_values(self, class_id)
    total_rows = values.shape[0]
    if row_index is None:
        # No row requested: pick one uniformly at random.
        row_index = random.randint(0, total_rows - 1)
    print("Displaying row", row_index, "of", total_rows,
          "(use `row_index` to specify another row)")
    column_labels = self.test_data.columns
    selected_row = values[row_index, :]
    return shap.waterfall_plot(baseline,
                               selected_row,
                               feature_names=column_labels,
                               **kwargs)
# ax.set_ylim([-0.2,0.2]) ax.set_title(feat) ind+=1 plt.subplots_adjust(hspace=0.8) plt.savefig('shap_sc.png') **Decision_plot()** is interesting as it shows how the prediction is formed from the contributions of different features. shap.decision_plot(explainerXGB.expected_value,shap_values_XGB_test[0:100],features) **Force_plot** is similar to decision_plot. We plot only the first 100 instances because it would be very slow to draw a force_plot with all the instances. shap.force_plot(explainerXGB.expected_value,shap_values_XGB_test[0:100],features,figsize=(20,10)) **Waterfall_plot** is great when you want to analyse one instance. shap.waterfall_plot(explainerXGB.expected_value,shap_values_XGB_test[2000],x_df.iloc[2000],features) ### Other interpretation methods For the following methods, we need to use XGBoost's Scikit-learn wrapper **XGBRegressor()** to make our XGBoost model compatible with the Scikit-learn ecosystem. m_depth = 5 eta = 0.1 ssample = 0.8 col_tree = 0.8 m_child_w = 3 gam = 1. objective = 'reg:squarederror' param = {'max_depth': m_depth, 'eta': eta, 'subsample': ssample, 'colsample_bytree': col_tree, 'min_child_weight' : m_child_w, 'gamma' : gam,'objective' : objective}
# Inspect the feature scores stored for the selected prediction / sample pair.
interpreter.feat_scores[pred, sample].shape
len(interpreter.feat_scores[pred, sample].shape)
interpreter.feat_scores[pred, sample]
# Model output for the same sample; the input slices off the first two
# entries of the last axis and adds two leading singleton dimensions.
model(interpreter.test_data[pred, sample, 2:].unsqueeze(0).unsqueeze(0))
# Sum of the feature scores plus the explainer's first expected value
# (presumably meant to be compared with the model output above - confirm).
np.sum(interpreter.feat_scores[pred, sample]) + interpreter.explainer.expected_value[0]
interpreter.feat_names
interpreter.test_data[pred, sample, :].numpy()
# Waterfall plot of the sample with shap's default display limit.
shap.waterfall_plot(interpreter.explainer.expected_value[0], interpreter.feat_scores[pred, sample], features=interpreter.test_data[pred, sample, 2:].numpy(), feature_names=interpreter.feat_names)
# Same plot restricted to two displayed features.
shap.waterfall_plot(interpreter.explainer.expected_value[0], interpreter.feat_scores[pred, sample], features=interpreter.test_data[pred, sample, 2:].numpy(), feature_names=interpreter.feat_names, max_display=2)
# Project-local waterfall plot, called with a baseline of 0 instead of the
# explainer's expected value (the expected-value variant is kept commented out).
# du.visualization.shap_waterfall_plot(interpreter.explainer.expected_value[0], interpreter.feat_scores[pred, sample],
du.visualization.shap_waterfall_plot(0, interpreter.feat_scores[pred, sample], interpreter.test_data[pred, sample, 2:], interpreter.feat_names, max_display=2)
# +
fig = go.Figure()
def model_interpretation(self, patient_id, patient_preprocessed, pred, prob, model):
    '''
    Build the evaluation plots of the model for one patient.

    Four PNG files are written under ``app/ai_models/temp/`` (probability
    curve, SHAP waterfall, feature distributions, similarity map) and the
    API URLs that serve them are returned.

    Arguments:
        patient_id = string identifying the patient
        patient_preprocessed = dictionary holding the patient's exam data
        pred = class predicted by the model (0 or 1)
        prob = probability the model assigned to the predicted class
        model = the model object

    Returns:
        dict with the prediction, the probability as a percentage string
        and the URL of each of the four plots.

    Raises:
        ValueError: if ``pred`` is neither 0 nor 1.
    '''
    # Fail before any figure is opened: every plot below picks a density
    # curve by class, so any other value would abort mid-plot with an
    # unbound variable and leave a figure open.
    if pred not in (0, 1):
        raise ValueError(f'pred must be 0 or 1, got {pred!r}')

    #### File names and API URLs of the plots
    plot_kinds = ('probacurve', 'shap', 'dist', 'mapa')
    media_url = f"http://{self.IP}:{self.API_PORT}/api/media/"
    plot_1_name, plot_2_name, plot_3_name, plot_4_name = (
        f'app/ai_models/temp/{kind}-{patient_id}.png' for kind in plot_kinds)
    plot_1_api, plot_2_api, plot_3_api, plot_4_api = (
        f'{media_url}{kind}-{patient_id}.png' for kind in plot_kinds)

    #### General matplotlib settings
    DPI_IMAGES = 100
    FONT_SIZE = 8
    FONT_NAME = 'sans-serif'
    plt.rc('font', family=FONT_NAME, size=FONT_SIZE)
    plt.rc('axes', titlesize=FONT_SIZE, labelsize=FONT_SIZE)
    plt.rc('xtick', labelsize=FONT_SIZE)
    plt.rc('ytick', labelsize=FONT_SIZE)
    plt.rc('legend', fontsize=FONT_SIZE)

    exame_resp = pred
    exame_prob = prob
    # The negative-class density is always drawn first (line 0) and the
    # positive-class density second (line 1).
    curve_idx = 0 if exame_resp == 0 else 1

    #### PLOT 1 - Distribution of the model probability for the patients
    # Needs: self.probs_df (imported csv) and pred
    fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
    sns.kdeplot(self.probs_df['prob_neg'],
                shade=True,
                color='#386796',
                ax=axis,
                linestyle="--",
                label='Casos Negativos')
    sns.kdeplot(self.probs_df['prob_pos'],
                shade=True,
                color='#F06C61',
                ax=axis,
                label='Casos positivos')

    # Place the patient on the density curve of the predicted class; for a
    # negative prediction the x position is the complement of the probability.
    xi = 1 - exame_prob if exame_resp == 0 else exame_prob
    data_x, data_y = axis.lines[curve_idx].get_data()
    yi = np.interp(xi, data_x, data_y)
    axis.plot([xi], [yi],
              linestyle='None',
              marker="*",
              color='black',
              markersize=10,
              label='Paciente')

    # Remaining plot settings
    axis.legend(loc="upper right")
    axis.set_xlim([0, 1])
    axis.set_ylim([0, axis.get_ylim()[1]])
    plt.tight_layout()

    # Save plot 1
    plt.savefig(plot_1_name,
                dpi=DPI_IMAGES,
                bbox_inches='tight',
                pad_inches=0.1)
    plt.close()

    #### PLOT 2 - SHAP
    # Needs: patient_preprocessed, pred and model
    features = np.array(list(patient_preprocessed.keys()))
    sample_x = np.array(list(patient_preprocessed.values()))

    explainer = TreeExplainer(model=model)
    # Baseline of the predicted class
    expected_value = explainer.expected_value[exame_resp]
    # SHAP values are computed once (the original computed them twice)
    shap_values_sample = explainer.shap_values(sample_x)

    waterfall_plot(expected_value,
                   shap_values_sample[exame_resp],
                   sample_x,
                   feature_names=features,
                   max_display=20,
                   show=False)

    # Save plot 2
    plt.tight_layout()
    plt.savefig(plot_2_name,
                dpi=DPI_IMAGES,
                bbox_inches='tight',
                pad_inches=0)
    plt.close()

    #### PLOT 3 - Distribution of the most important model features
    # Needs: self.train_df (imported csv), patient_preprocessed, pred
    important_features = [
        'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos'
    ]
    target_0 = self.train_df[self.train_df['target'] == 0][important_features]
    target_1 = self.train_df[self.train_df['target'] == 1][important_features]

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))

    # axes.flat walks the 2x2 grid row by row, the same order the features
    # were laid out in before.
    for feat, feat_ax in zip(important_features, axes.flat):
        sns.kdeplot(list(target_0[feat]),
                    shade=True,
                    color='#386796',
                    ax=feat_ax,
                    label='Casos Negativos',
                    linestyle="--")
        sns.kdeplot(list(target_1[feat]),
                    shade=True,
                    color='#F06C61',
                    ax=feat_ax,
                    label='Casos positivos')

        # Density curve of the predicted class
        data_x, data_y = feat_ax.lines[curve_idx].get_data()

        # Patient's value for this feature, placed on that curve
        xi = patient_preprocessed[feat]
        yi = np.interp(xi, data_x, data_y)
        feat_ax.plot([xi], [yi],
                     linestyle='None',
                     marker="*",
                     color='black',
                     markersize=10,
                     label='Paciente')
        feat_ax.set_title(feat)
        feat_ax.legend(loc="upper right")
        feat_ax.set_ylim([0, feat_ax.get_ylim()[1]])

    plt.tight_layout()

    # Save plot 3
    plt.savefig(plot_3_name,
                dpi=DPI_IMAGES,
                bbox_inches='tight',
                pad_inches=0.1)
    plt.close()

    #### PLOT 4 - Two-component map of the patients
    # Needs: train_df (imported csv), patient_preprocessed
    amostra = pd.DataFrame(patient_preprocessed, index=[
        0,
    ]).drop(axis=1, columns=['Outra gripe'])

    y_train = self.train_df['target']  # keep the target column
    dados = self.train_df.drop(
        axis=1, columns=['Outra gripe', 'target']).copy()  # data for the map
    # NOTE(review): the original comment mentions the prince package; the
    # .loc / column indexing below presumes transform() returns a DataFrame
    # - confirm which PCA class is imported.
    pca_obj = PCA(n_components=2, random_state=42)
    pca_obj.fit(dados)
    componentes = pca_obj.transform(dados)  # components of the training data
    transf = pca_obj.transform(amostra)  # patient projected on the map
    xi = transf.loc[0, 0]  # patient X coordinate
    yi = transf.loc[0, 1]  # patient Y coordinate

    comp = pd.DataFrame()
    comp['C1'] = componentes[0]  # principal component 1
    comp['C2'] = componentes[1]  # principal component 2
    comp['TG'] = y_train  # target, used as mask
    comp_0 = comp[comp['TG'] == 0][['C1', 'C2']]  # negatives
    comp_1 = comp[comp['TG'] == 1][['C1', 'C2']]  # positives

    fig, ax = plt.subplots(figsize=(8, 8))
    plt.margins(0, 0)
    sns.scatterplot(ax=ax,
                    data=comp_0,
                    x='C1',
                    y='C2',
                    color='#386796',
                    label='Casos Negativos')
    sns.scatterplot(ax=ax,
                    data=comp_1,
                    x='C1',
                    y='C2',
                    color='#F06C61',
                    label='Casos Positivos')

    # One shaded ellipse per class, built by self.build_ellipse
    x_mean, y_mean, width, height, angle = self.build_ellipse(
        comp_0['C1'], comp_0['C2'])
    ax.add_patch(
        Ellipse((x_mean, y_mean),
                width,
                height,
                angle=angle,
                linewidth=2,
                color='#386796',
                fill=True,
                alpha=0.2))

    x_mean, y_mean, width, height, angle = self.build_ellipse(
        comp_1['C1'], comp_1['C2'])
    ax.add_patch(
        Ellipse((x_mean, y_mean),
                width,
                height,
                angle=angle,
                linewidth=2,
                color='#F06C61',
                fill=True,
                alpha=0.2))

    ax.plot([xi], [yi],
            linestyle='None',
            marker="*",
            color='black',
            markersize=10,
            label='Paciente')

    # Plot settings: no ticks or axis labels, legend sorted by label
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_ylabel('')
    ax.set_xlabel('')
    handles, labels = ax.get_legend_handles_labels()
    labels, handles = zip(
        *sorted(zip(labels, handles), key=lambda t: t[0]))
    ax.legend(handles, labels, loc="upper right")

    # Save plot 4
    plt.axis('off')
    plt.savefig(plot_4_name,
                dpi=DPI_IMAGES,
                bbox_inches='tight',
                pad_inches=0)
    plt.close()

    model_result = {
        'prediction': pred,
        'probability': str(round(prob * 100, 2)),
        'probacurve': plot_1_api,
        'shap_img': plot_2_api,
        'dist_img': plot_3_api,
        'mapa_img': plot_4_api
    }
    return model_result
def graficos():
    """Render SHAP force and waterfall plots for the stored record.

    Reads the record from ``./otros/registro.csv``, loads the pickled model
    and KNN imputer, imputes the listed columns, computes SHAP values with a
    ``TreeExplainer`` and writes ``./www/summary_plot1.png`` (force plot) and
    ``./www/summary_plot2.png`` (waterfall plot).

    Returns:
        True once both images have been written.
    """
    shap.initjs()

    # paths
    ruta_registro = './otros/registro.csv'
    ruta_modelo = './otros/final_model.pkl'
    ruta_imputer = './otros/KNNimputer.pickle'

    # new record
    nuevo_registro = pd.read_csv(ruta_registro)

    # NOTE: unpickling executes arbitrary code; these files must come from a
    # trusted source. The files are opened with `with` so the handles are
    # closed (the original left them open).
    with open(ruta_modelo, 'rb') as modelo_fh:
        loaded_model = pickle.load(modelo_fh)
    with open(ruta_imputer, 'rb') as imputer_fh:
        imputer = pickle.load(imputer_fh)

    # columns to impute
    imputar_lista = [
        'LANDAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_AVG',
        'YEARS_BEGINEXPLUATATION_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE',
        'TOTALAREA_MODE', 'REGION_RATING_CLIENT', 'APARTMENTS_MODE',
        'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MODE',
        'LIVINGAPARTMENTS_AVG', 'OBS_60_CNT_SOCIAL_CIRCLE', 'COMMONAREA_AVG',
        'LIVINGAREA_AVG', 'ENTRANCES_AVG', 'LIVINGAREA_MODE',
        'OBS_30_CNT_SOCIAL_CIRCLE', 'NONLIVINGAREA_MODE', 'LIVINGAREA_MEDI',
        'NONLIVINGAPARTMENTS_MEDI', 'ELEVATORS_AVG', 'FLOORSMIN_AVG',
        'FLAG_DOCUMENT_13', 'FLOORSMAX_AVG', 'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_18', 'WALLSMATERIAL_MODE_Panel', 'ELEVATORS_MODE',
        'FLOORSMIN_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMAX_MODE',
        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_6',
        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20'
    ]

    # imputation: the record is flattened to a single row for the imputer
    imputado_nuevo = pd.DataFrame(imputer.transform(
        np.array(nuevo_registro).reshape(1, -1)),
                                  columns=nuevo_registro.columns)

    # copy only the imputed columns back into the record
    nuevo_registro.loc[:, imputar_lista] = imputado_nuevo.loc[:, imputar_lista]

    # shap values
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(nuevo_registro)

    # figure 1: force plot for index 1 of the SHAP output, first row
    plt.clf()
    shap.force_plot(explainer.expected_value[1],
                    shap_values[1][0, :],
                    nuevo_registro.iloc[0, :],
                    link='logit',
                    matplotlib=True,
                    show=False)
    plt.savefig('./www/summary_plot1.png',
                bbox_inches='tight',
                pad_inches=1.0,
                dpi=1000)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

    # figure 2: waterfall plot
    # NOTE(review): the column names are passed as the third positional
    # argument; check against the installed shap version whether that slot
    # is `features` or `feature_names`.
    plt.figure(figsize=(500, 1000))
    shap.waterfall_plot(explainer.expected_value[1],
                        shap_values[1][0, :],
                        nuevo_registro.columns.values,
                        show=False)
    plt.savefig("./www/summary_plot2.png",
                bbox_inches='tight',
                pad_inches=1.0,
                dpi=1000)
    plt.close()
    return True
if prediction_explanation: st.subheader('Prediction Explanation') st.markdown( '''<p><small>The waterfall plot is designed to visually display how the values of each feature moves the <code>average</code> stock value to the <code>predicted</code> stock value. Visit <a href="https://price-valuation-explainer.herokuapp.com/" target="_blank">Stock Valuation Explainer</a> for more detail. </small></p>''', unsafe_allow_html=True) shap_values = models[f'{key} Explainer']( data['Features'][key].loc[slice(ticker, ticker), :])[0] _lock = RendererAgg.lock with _lock: shap.waterfall_plot(shap_values, max_display=20) st.pyplot() # SIMILIAR STOCKS if similiar_stocks: st.subheader('Similiar Stocks') st.markdown( '''<p><small>The 10 most similiar stocks using <code>cosine similarity</small></p>''', unsafe_allow_html=True) cols = [ 'Ticker', 'Close', 'Predicted Close', 'Company Name', 'Sector', 'Industry', 'Market-Cap', 'Enterprise Value', 'Price to Earnings Ratio (ttm)', 'Price to Sales Ratio (ttm)', 'Price to Book Value', 'Price to Free Cash Flow (ttm)', 'EV/EBITDA', 'EV/Sales', 'EV/FCF', 'Book to Market Value',
def main():
    """Run the Streamlit playlist-success application.

    The script is re-executed by Streamlit on every widget change, so
    long-lived results are kept on the object returned by ``get_state()``.
    The flow is:

    1. Sidebar inputs (experiment name, genre, thresholds, models, optional
       feature engineering).
    2. When "train" is ticked: label the data, run the PyCaret experiment
       once, and show the comparison table plus static help text.
    3. When "feature importance" is ticked: build a SHAP TreeExplainer for
       the chosen model and draw summary, dependence and per-row waterfall
       plots (the latter only for the ``xgboost`` model).

    Every path ends in ``st.stop()``.

    Raises:
        Exception: if no model ends up selected for training.
    """
    logging.info("Main script is refreshed...")

    # Custom functionality for ensuring changing widgets do not cause previous sections to reset
    state = get_state()

    st.title("What Makes a Playlist Successful?")
    st.write(
        "**This application trains & evaluates playlist success classification models, "
        "and generates SHAP visualizations for analyzing feature importance**")
    st.write(
        "[Created By: Alexander Wong](https://www.linkedin.com/in/alexrobwong/)",
        unsafe_allow_html=True,
    )
    if st.checkbox("Click to watch recorded demo"):
        st.video("https://www.youtube.com/watch?v=dPsGxb9lTUY")

    # Sidebar Inputs -------------------------------------------------------------------------------------------------
    experiment_name_input = st.sidebar.text_input("Experiment name:")
    experiment_name = f"{experiment_name_input}_{str(datetime.now())}"

    genre_options = GENRES
    default_ix = GENRES.index("Dance & House")
    selected_genre = st.sidebar.selectbox("Select genre:",
                                          options=genre_options,
                                          index=default_ix)
    # selected genre must be a list
    genre = [selected_genre]

    users_threshold = st.sidebar.number_input(
        "Minimum monthly number of Users:",
        min_value=10,
    )
    success_threshold = (st.sidebar.slider(
        "Streaming-ratio success threshold (%):",
        min_value=1,
        max_value=99,
        value=70,
    ) / 100)
    holdout_fraction = (st.sidebar.slider(
        "Test Size (%):", min_value=1, max_value=30, value=5) / 100)

    model_map = {
        "Extreme Gradient Boosting": "xgboost",
        "Decision Tree Classifier": "dt",
        "Extra Trees Classifier": "et",
        "Light Gradient Boosting Machine": "lightgbm",
        "Random Forest Classifier": "rf",
    }
    model_selection = list(
        st.sidebar.multiselect("Models to train:",
                               options=list(model_map.keys())))

    optionals = st.sidebar.beta_expander(
        "Additional Feature Engineering Parameters", False)
    # The checkbox results are already truthy/falsy flags; bool() replaces
    # the former if/else ladders.
    polynomials = bool(optionals.checkbox("Feature Polynomials"))
    interactions = bool(optionals.checkbox("Feature Interactions"))
    ratios = bool(optionals.checkbox("Feature Ratios"))

    # Experiment & Model Training -------------------------------------------------------------------------------------
    train = st.checkbox("Click to train models")
    if train:
        # Application can only be run start to finish if xgboost is selected...add it to the list of options
        exb_added = False
        if "Extreme Gradient Boosting" not in model_selection:
            model_selection.append("Extreme Gradient Boosting")
            exb_added = True

        # Bugfix - must select at least two models to train other wise model object is used instead of index
        lgb_added = False
        if "Light Gradient Boosting Machine" not in model_selection:
            model_selection.append("Light Gradient Boosting Machine")
            lgb_added = True

        include_models = [model_map[name] for name in model_selection]

        # Check that models are selected - if none are selected, all models will be trained (undesired app behavior)
        if not include_models:
            raise Exception(
                "No models were selected. Please re-start the application")

        base_frame = pd.read_parquet("data/streamlit_data.parquet")
        state.genre_frame = base_frame.loc[lambda f: f["genre_1"].isin(genre)]
        labelled_frame = classify_success(state.genre_frame, users_threshold,
                                          success_threshold)
        train_frame, holdout_frame = create_holdout(
            labelled_frame, holdout_fraction=holdout_fraction)

        # PyCaret setup to train models (only once per session)
        if not state.experiment_complete:
            with st.spinner("Model Training in Progress"):
                if exb_added:
                    st.success(
                        "**Extreme Gradient Boosting Model** automatically added by default into model pipeline"
                    )
                if lgb_added:
                    st.success(
                        "**Light Gradient Boosting Machine Model** automatically added by default into model pipeline"
                    )
                setup(
                    data=train_frame,
                    numeric_features=MODEL_NUMERICAL_FEATURES,
                    categorical_features=MODEL_CATEGORICAL_FEATURES,
                    target="success_streaming_ratio_users",
                    ignore_features=["playlist_uri"],
                    test_data=holdout_frame,
                    session_id=123,
                    ignore_low_variance=True,
                    remove_outliers=True,
                    fix_imbalance=True,
                    remove_multicollinearity=True,
                    log_experiment=True,
                    log_data=True,
                    fold=2,
                    n_jobs=-1,
                    combine_rare_levels=True,
                    experiment_name=experiment_name,
                    silent=True,
                    feature_interaction=interactions,
                    feature_ratio=ratios,
                    polynomial_features=polynomials,
                )
                state.list_models = compare_models(n_select=5,
                                                   round=3,
                                                   cross_validation=False,
                                                   include=include_models)
                state.experiment_complete = True
                state.X_train = get_config(variable="X_train")
                state.y_train = get_config(variable="y_train")
                state.view = pd.merge(state.y_train,
                                      state.X_train,
                                      left_index=True,
                                      right_index=True).reset_index(drop=True)

        # Display model training results
        st.header("Model Training & Testing Results")
        exp = pull()
        st.dataframe(exp)
        st.info("**Models were trained using default parameters**")
        # Adjacent literals are concatenated verbatim, so each fragment must
        # carry its own separating space.
        st.info(
            "To improve individual model performance, "
            "please consider offline **hyperparameter tuning** techniques such as **Grid Search**. "
            "To improve overall performance, please consider advanced offline **ensembling** techniques "
            "such as **Bagging**, **Boosting**, **Stacking**")

        # Model Definitions
        models_expander = st.beta_expander("Model Definitions")
        models_expander.write(
            "[**Decision Tree Classifier**](https://en.wikipedia.org/wiki/Decision_tree_learning)"
        )
        models_expander.write(
            "A Decision Tree is a simple representation for "
            "classifying examples, a form of Supervised Machine Learning where the data is "
            "continuously split according to a certain parameter. A decision tree starts with a "
            "single node, which branches into possible outcomes. Each of those outcomes "
            "leads to additional nodes, which branch off into other possibilities"
        )
        models_expander.write("")
        models_expander.write(
            "[**Random Forest Classifier**](https://en.wikipedia.org/wiki/Random_forest)"
        )
        models_expander.write(
            "An ensemble learning method "
            "that operates by constructing a multitude of decision trees at training time, "
            "where each tree is trained on a bootstrap replica of the training data and final "
            "model classification is decide via majority vote from the constituent trees"
        )
        models_expander.write("")
        models_expander.write(
            "[**Extra Trees Classifier**](https://quantdare.com/what-is-the-difference-between"
            "-extra-trees-and-random-forest/)")
        models_expander.write(
            "Extremely randomized trees is similar to Random Forest, "
            "in that it builds multiple trees and splits nodes using random subsets of features, "
            "but with two key differences: it does not bootstrap observations (meaning it samples "
            "without replacement), and nodes are split on random splits, not best splits"
        )
        models_expander.write("")
        models_expander.write(
            "[**Extreme Gradient Boosting**](https://en.wikipedia.org/wiki/Gradient_boosting)"
        )
        models_expander.write(
            "Boosting is a technique which combines a learning "
            "algorithm in series to achieve a strong learner from many sequentially connected "
            "weak learners. In case of gradient boosted decision trees algorithm, "
            "the weak learners are decision trees where each tree attempts to minimize the errors "
            "of previous tree. Trees in boosting are weak learners but adding many trees in series "
            "and each focusing on the errors from previous one make boosting a "
            "highly efficient and accurate model")
        models_expander.write("")
        models_expander.write(
            "[**Light Gradient Boosting Machine**](https://lightgbm.readthedocs.io/en/latest/)"
        )
        models_expander.write(
            "A gradient boosting framework for machine "
            "learning originally developed by Microsoft. Similar to Extreme Gradient Boosting, "
            "it is based on decision tree algorithms, however unlike Extreme Gradient Boosting, "
            "the algorithm splits the tree leaf wise instead of level wise")
        models_expander.write("")

        # Model Evaluation Metrics
        metrics_expander = st.beta_expander("Model Evaluation Metrics")
        metrics_expander.write("**Accuracy**")
        metrics_expander.write(
            "Accuracy is defined as the percentage of correct predictions for the test data."
            " It can be calculated easily by dividing the number of correct predictions by the "
            "number of total predictions.")
        metrics_expander.write("")
        metrics_expander.write("**AUC**")
        # An ROC curve plots the true positive rate against the false
        # positive rate; the original text said "False Negative Rate".
        metrics_expander.write(
            "An ROC curve (receiver operating characteristic curve) is a graph showing the "
            "performance of a classification model at all classification thresholds. This curve "
            "plots the True Positive Rate (TP) and False Positive Rate (FP)")
        metrics_expander.write("")
        metrics_expander.write("**Recall**")
        metrics_expander.write(
            "Recall is defined as the fraction of examples which were predicted to belong "
            "to a class with respect to all of the examples that truly belong in the class."
        )
        metrics_expander.write("")
        metrics_expander.write("**Precision**")
        metrics_expander.write(
            "Precision is defined as the fraction of relevant examples (true positives) among "
            "all of the examples which were predicted to belong in a certain class."
        )
        metrics_expander.write("")
        metrics_expander.write("**F1**")
        metrics_expander.write(
            "The traditional F-measure or balanced F-score (F1 score) is the harmonic mean "
            "of precision and recall and is calculated as --> F1 score = 2 * (Precision * Recall) / "
            "(Precision + Recall)")
        metrics_expander.write("")
        metrics_expander.write("**Kappa**")
        metrics_expander.write(
            "The Kappa statistic (or value) is a metric that compares an Observed Accuracy with "
            "an Expected Accuracy (random chance). The kappa statistic is used not only to evaluate "
            "a single classifier, but also to evaluate classifiers amongst themselves. In addition, "
            "it takes into account random chance (agreement with a random classifier), which"
            " generally means it is less misleading than simply using accuracy as a metric "
            "(an Observed Accuracy of 80% is a lot less impressive with an Expected Accuracy of "
            "75% versus an Expected Accuracy of 50%)")
        metrics_expander.write("")
        metrics_expander.write("**MCC**")
        metrics_expander.write(
            "Unlike the other metrics discussed above, MCC takes all the cells of the Confusion"
            " Matrix into consideration in its formula --> MCC = TP * TN – FP * FN / √ (TP +FP) * "
            "(TP + FN) * (TN + FP) * (TN + FN) .Similar to Correlation Coefficient, the range of "
            "values of MCC lie between -1 to +1. A model with a score of +1 is a perfect model "
            "and -1 is a poor model. This property is one of the key usefulness of MCC as it"
            " leads to easy interpretability.")
        metrics_expander.write("")

        # Additional model data
        opts = st.beta_expander("Additional Model Data", False)

        # Download the training data as an excel file
        if opts.button("Display Link to Download Model Training Data"):
            st.markdown(get_table_download_link(state.view),
                        unsafe_allow_html=True)

        # Prompt to launch MLFlow
        if opts.button("Display Link to Spotify Model Training History"):
            st.info(
                "Note that this application uses MLFlow only when both the application and MLFlow are "
                "deployed locally")

        # Overall importance ------------------------------------------------------------------------------------------
        st.write("")  # Intentional extra blank spaces
        st.write("")
        st.header(f"Success Drives for {selected_genre} Playlists")

        # Map each model id of the results table to its position, which is
        # used to index state.list_models below.
        dict_models = {model: i for i, model in enumerate(exp.index)}

        user_selected_model = st.selectbox(
            "Select model to view feature importance:", exp.index)
        state.importance = st.checkbox("Click to calculate feature importance")

        if state.importance and state.experiment_complete:
            state.new_selected_model = state.list_models[
                dict_models[user_selected_model]]
            st.write("**Model parameters: **")
            st.write(state.new_selected_model)
            st.write("")
            st.write("**Generating Visualizations...**")
            bar = st.progress(0)

            # Rebuild the explainer and SHAP values only when the selected
            # model changed since the previous run.
            if state.selected_model != state.new_selected_model:
                state.selected_model = state.new_selected_model
                state.explainer = shap.TreeExplainer(state.selected_model)
                state.shap_values = state.explainer.shap_values(
                    state.X_train.to_numpy())
            bar.progress(25)

            # Overall Feature Importance -------------------------------------------------------------------------
            st.subheader("Success Drivers - Average")
            st.pyplot(
                shap.summary_plot(state.shap_values,
                                  state.X_train,
                                  plot_type="bar"))

            # Violin plot and waterfall plot only available at this time for XGBoost model
            if user_selected_model != "xgboost":
                st.warning(
                    "This PoC has only been configured for when **Extreme Gradient Boosting "
                    "(xgboost)** is selected for analysis")
                bar.progress(100)
                st.stop()
            else:
                # Violin Feature Importance --------------------------------------------------------------------------
                st.subheader(
                    f"Success Drivers - All {selected_genre} Playlists")
                st.pyplot(shap.summary_plot(state.shap_values, state.X_train))
                bar.progress(50)

                # Dependence plots for each of the top 3 features ----------------------------------------------------
                st.header(f"Shapley Dependence for {selected_genre} Playlists")
                vals = np.abs(state.shap_values).mean(0)
                feature_importance = pd.DataFrame(
                    list(zip(state.X_train.columns, vals)),
                    columns=["col_name", "feature_importance_vals"],
                )
                feature_importance = (feature_importance.sort_values(
                    by=["feature_importance_vals"],
                    ascending=False).reset_index(drop=True).head(3))
                top_features = list(feature_importance["col_name"])

                for feature in top_features:
                    index = list(state.X_train.columns).index(feature)
                    st.subheader(f"Shapley Value Dependence for {feature}")
                    st.pyplot(
                        shap.dependence_plot(
                            index,
                            state.shap_values,
                            state.X_train,
                            alpha=0.5,
                            interaction_index=None,
                        ))
                bar.progress(70)

                # Individual importance -------------------------------------------------------------------------------
                st.header(
                    f"Explaining {selected_genre} Playlist Success Prediction")

                # Display the data frame for users to visually see the row they want to analyze
                st.subheader("Model Training Data")
                st.dataframe(state.view)

                # Rows are 0-based, so the last valid row is len - 1; the
                # default is clamped so it never exceeds that maximum.
                last_row = max(len(state.view) - 1, 0)
                state.new_row = int(
                    st.number_input(
                        "Row from dataframe to inspect",
                        min_value=0,
                        max_value=last_row,
                        value=min(10, last_row),
                    ))

                if state.row != state.new_row:
                    state.row = state.new_row
                    # Reuse the SHAP values cached above instead of
                    # re-explaining the whole training set for one row.
                    shap_object = ShapObject(
                        base_values=state.explainer.expected_value,
                        values=state.shap_values[state.row, :],
                        feature_names=state.X_train.columns,
                        data=state.X_train.iloc[state.row, :],
                    )
                    bar.progress(85)
                    st.subheader(
                        f"Feature Contributions to {selected_genre} Playlist #{state.row}"
                    )
                    st.pyplot(shap.waterfall_plot(shap_object))
                    bar.progress(100)
                    st.stop()
                else:
                    st.stop()
        else:
            st.stop()
    else:
        st.stop()
predict_proba = RFModel.predict_proba(data_for_prediction_array) import warnings warnings.filterwarnings("ignore") # Create SHAP explainer explainer = shap.TreeExplainer(RFModel) # Get shap values for observtation of interest shap_values = explainer.shap_values(data_for_prediction.values, check_additivity=False) decisionhtml = shap.decision_plot(base_value= explainer.expected_value[1], shap_values= shap_values[1], features= data_for_prediction, feature_names=data_for_prediction.columns.tolist(),show = False) plt.savefig('decisionPlot.pdf') plt.close() onedshap_values = shap_values[1].flatten() shap.waterfall_plot(explainer.expected_value[1], onedshap_values, feature_names=data_for_prediction.columns, max_display=10, show=False) plt.savefig('waterfallPlot.pdf') plt.close() # SHAP Plots for Class 1 (sRNA-mRNA Interaction) forcehtml = shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction) shap.save_html(out_file = 'forcePlot.html', full_html=False, plot = forcehtml) elif((len(sys.argv) - 1) < 2): print("Error: Required parameters not passed! Please pass two parameters, sRNA ID and mRNA ID.") else:
shap.force_plot(mlp_interpreter.explainer.expected_value[pred], mlp_shap_values[pred][sample], features=test_features[sample, 1:].numpy(), feature_names=feat_names) # #### Sample decision plot shap.decision_plot(mlp_interpreter.explainer.expected_value[pred], mlp_shap_values[pred][sample], features=test_features[sample, 1:].numpy(), feature_names=feat_names) # #### Sample waterfall plot shap.waterfall_plot(mlp_interpreter.explainer.expected_value[pred], mlp_shap_values[pred][sample], features=test_features[sample, 1:].numpy(), feature_names=feat_names) test_features[:, 1:].shape test_features[0].shape test_features[0].unsqueeze(0).shape # ## Interpreting XGBoost # ### Loading the model xgb_model = joblib.load(f'{models_path}xgb/checkpoint_27_01_2020_04_47.model') xgb_model
def explain(client):
    """Render the 'Model Explainability' page for a user-selected farm.

    The farm is chosen in the sidebar; its data is fetched with
    ``load_data(client, farm, limit=200)`` and its model is looked up in
    ``load_models()``. Per-feature SHAP contributions are taken from the
    booster's ``pred_contribs`` prediction output and shown in two expanders:

    * a SHAP summary plot of feature importance, and
    * a waterfall plot for one of the last 48 predictions, selected with a
      slider and highlighted on a small timeline of those predictions.

    Args:
        client: Handle passed through unchanged to ``load_data``.

    Returns:
        None. The function only draws Streamlit / matplotlib output.
    """
    st.header('Model Explainability')
    st.markdown(
        '<p style="text-align:justify;"> '
        'Model explainability is the ability to explain the internal mechanics '
        'of a model in human terms. It is an important tool to make reasoning '
        'behind each decision in machine learning transparent and '
        'repeatable.<br> '
        '<a href="https://github.com/slundberg/shap" target="_blank">SHAP</a> '
        '(SHapley Additive exPlanations) is a python module that uses game '
        'theoretic approach to explain machine learning models. It will be used '
        'to explore the explainability of all XGBoost models for different wind '
        'farms.</p>',
        unsafe_allow_html=True)

    farm_select = st.sidebar.selectbox('Select a farm', FARM_NAME_LIST)
    show_gif(icon='default')
    # FARM_NAME_LIST and FARM_LIST are parallel lists: map the display name
    # picked in the sidebar back to the farm entry at the same position.
    farm = FARM_LIST[FARM_NAME_LIST.index(farm_select)]
    df = load_data(client, farm, limit=200)
    models = load_models()
    model = models[farm]

    with st.spinner('Running Calculations...'):
        X, _ = transform_data(df)
        # With pred_contribs=True the booster returns one column per feature
        # plus a final bias column (the baseline / expected value).
        shap_val = model.get_booster().predict(DMatrix(X), pred_contribs=True)
        expected_val = shap_val[0][-1]
        # Drop the bias column so only per-feature contributions remain.
        shap_val = np.delete(shap_val, obj=-1, axis=1)
        col_name = [format_title(col) for col in list(X.columns)]

    importance = st.beta_expander('Feature importance based on SHAP value',
                                  expanded=True)
    # The explanatory text previously contained most of its lines three times
    # over; each sentence now appears exactly once.
    importance.markdown(
        '<p style="text-align:justify;"> '
        'The following plot summarizes feature importance based on SHAP '
        'values (i.e. how much each feature changes the model outcome '
        'when conditioning on that feature). The features are sorted by '
        'the sum of the magnitudes of SHAP values. The colour represents '
        'feature value, while red is high and blue is low.<br> '
        'For example, if a red (high feature value) data point shows a '
        'positive SHAP value, it increases the predicted value; if the '
        'SHAP value is negative, it lowers the predicted value. A point '
        'far away from zero point also has a higher impact (either '
        'negative or positive) than that is near zero point.</p>',
        unsafe_allow_html=True)
    # show=False keeps the figure open so Streamlit can render it.
    summary_plot(shap_val, X, show=False, feature_names=col_name)
    importance.pyplot(bbox_inches='tight', dpi=150)

    contribution = st.beta_expander(
        'Feature contribution for individual prediction')
    contribution.markdown(
        '<p style="text-align:justify;"> '
        'The waterfall plot below demonstrates how much each feature '
        'contributes to pushing the model from the baseline value '
        '(indicated by <i>E[f(X)]</i>) to model output (indicated by '
        '<i>f(X)</i>) in an intuitive manner.<br> '
        'The plot can show all the individual predictions for today & '
        'tomorrow (48 h in total). '
        "Use the slider below to choose which hour's prediction you'd like "
        'to view.</p>',
        unsafe_allow_html=True)

    # Last 48 rows of the data, re-indexed 0..47 so the 1-based slider value
    # `i` maps to position `i - 1`.
    pred = df[-48:].reset_index(drop=True).prediction
    i = contribution.slider('Select the hour', min_value=1, max_value=48,
                            value=24)

    # Thin timeline of the 48 predictions with the selected hour highlighted.
    plt.rcParams.update({'font.size': 20})
    _, ax = plt.subplots(figsize=(18, 2))
    ax.plot(range(1, 49), pred, c='#1e88e5')
    ax.scatter(i, pred[i - 1], c='#ff0d57', s=300)
    ax.xaxis.set_ticks(np.arange(0, 48, step=6))
    ax.set_xlim(1, 48)
    ax.set_xlabel('Hour')
    ax.tick_params(axis="y", direction="in", pad=-42)
    ax.get_yaxis().set_ticks([])
    contribution.pyplot(bbox_inches='tight', dpi=150, pad_inches=0.01)

    # Waterfall for the same row that is highlighted on the timeline.
    waterfall_plot(expected_val, shap_val[-48:][i - 1],
                   feature_names=col_name, max_display=10, show=False)
    contribution.pyplot(bbox_inches='tight', dpi=150, pad_inches=0)