Example #1
def shap_explain_xgb(data_set_path, plot_n=None):

    pickle_path = data_set_path.replace('.csv', '_xgb_model.pkl')
    assert os.path.isfile(
        pickle_path
    ), 'You must run listing 9.6 to save an XGB regression model first'
    with open(pickle_path, 'rb') as fid:
        xgb_model = pickle.load(fid)

    current_df = reload_churn_data(data_set_path,
                                   'current',
                                   '8.3',
                                   is_customer_data=True)
    explainer = shap.TreeExplainer(xgb_model,
                                   feature_perturbation="interventional",
                                   model_output='probability',
                                   data=current_df)
    shap_values = explainer(current_df)
    shap.summary_plot(shap_values, current_df, show=False)
    save_file = data_set_path.replace('.csv', '_shap_summary_xgb.png')
    print(f'Saving SHAP Explanation to {save_file}')
    plt.tight_layout()
    plt.savefig(save_file, format='png')
    plt.close()

    if plot_n is not None:
        for n in plot_n:
            shap.waterfall_plot(shap_values[n], show=False)
            save_file = data_set_path.replace('.csv',
                                              f'_shap_water_xgb_{n}.png')
            plt.tight_layout()
            plt.savefig(save_file, format='png')
            plt.close()
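
A hedged usage sketch; the CSV path below is hypothetical and assumes the pickled model from listing 9.6 sits next to the dataset:

# Hypothetical call: saves the summary plot plus waterfall plots for customers 0 and 5
shap_explain_xgb('churn_data.csv', plot_n=[0, 5])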
Example #2
def shapley_tree(model_predict, obs, dataset, column_names, plot_draw=False):
    explainer = shap.KernelExplainer(model_predict, shap.sample(dataset, 100))
    shap_values = explainer.shap_values(obs)
    if plot_draw:
        shap.waterfall_plot(explainer.expected_value,
                            shap_values,
                            feature_names=column_names)
    return shap_values, explainer.expected_value
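
A hedged usage sketch, assuming `reg` is a fitted scikit-learn-style regressor and `X` is a NumPy feature matrix with matching `feature_names` (all three names are hypothetical):

# Hypothetical call: explain the first observation against 100 background samples
values, base_value = shapley_tree(reg.predict, X[0], X, list(feature_names), plot_draw=True)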
Example #3
def shapley_diff(model,
                 obs,
                 dataset,
                 column_names,
                 treatment_col,
                 plot_draw=True):
    shap_t0, exp0 = shapley_tree(predict_treatment(model, treatment_col, 0),
                                 obs, dataset, column_names)
    shap_t1, exp1 = shapley_tree(predict_treatment(model, treatment_col, 1),
                                 obs, dataset, column_names)
    if plot_draw:
        shap.waterfall_plot(exp1 - exp0,
                            shap_t1 - shap_t0,
                            feature_names=column_names)
    return shap_t1 - shap_t0, exp1 - exp0
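
`predict_treatment` is not defined in this example; a minimal sketch of what such a wrapper could look like, assuming a binary classifier with `predict_proba` and a feature matrix whose treatment flag sits at column index `treatment_col` (all names here are hypothetical):

import numpy as np

def predict_treatment(model, treatment_col, treatment_value):
    # Return a prediction function with the treatment column forced to a fixed value,
    # so SHAP explains the model under that counterfactual treatment assignment
    def _predict(X):
        X_mod = np.array(X, copy=True)
        X_mod[:, treatment_col] = treatment_value
        return model.predict_proba(X_mod)[:, 1]
    return _predict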
Example #4
 def waterfall_plot(self, row_idx=None, class_id=0, **kwargs):
     "Plots explaination of single prediction as waterfall plot"
     shap_vals, exp_val = _get_values(self, class_id)
     n_rows = shap_vals.shape[0]
     row_idx = random.randint(0, n_rows-1) if row_idx is None else row_idx
     print(f'Displaying row {row_idx} of {n_rows} (use `row_idx` to specify another row)')
     feat_names = self.test_data.columns
     return shap.waterfall_plot(exp_val, shap_vals[row_idx,:], feature_names=feat_names, **kwargs)
Example #5
 def waterfall_plot(self, row_index=None, class_id=0, **kwargs):
     """
     Plots an explanation of a single prediction as a waterfall plot.
     
     `row_index` is the index of the row in `test_data` that will be analyzed; if it is None, a row is drawn at random
     `class_id` is used to indicate the class of interest for classification models; it can be an int or a string
     
     For an up-to-date list of the parameters, see: https://github.com/slundberg/shap/blob/master/shap/plots/waterfall.py
     """
     shap_values, expected_value = _get_values(self, class_id)
     nb_rows = shap_values.shape[0]
     row_index = random.randint(0,nb_rows-1) if row_index is None else row_index
     print("Displaying row", row_index, "of", nb_rows, "(use `row_index` to specify another row)")
     feature_names = self.test_data.columns
     return shap.waterfall_plot(expected_value, shap_values[row_index,:], feature_names=feature_names, **kwargs)
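
A hedged usage sketch, assuming `interp` is an instance of the class this method belongs to, with `test_data` and `_get_values` already populated (the object name is hypothetical):

# Hypothetical call: waterfall plot for row 3 of test_data, explaining class 1
interp.waterfall_plot(row_index=3, class_id=1)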
Example #6
#    ax.set_ylim([-0.2,0.2])
    ax.set_title(feat)
    ind+=1
plt.subplots_adjust(hspace=0.8)
plt.savefig('shap_sc.png')

**decision_plot()** is interesting because it shows how the prediction is built up from the contributions of the different features.

shap.decision_plot(explainerXGB.expected_value, shap_values_XGB_test[0:100], features)

**force_plot()** is similar to decision_plot(). We plot only the first 100 instances because drawing a force plot with all the instances would be very slow.

shap.force_plot(explainerXGB.expected_value, shap_values_XGB_test[0:100], features, figsize=(20, 10))

**waterfall_plot()** is great when you want to analyse a single instance.

shap.waterfall_plot(explainerXGB.expected_value, shap_values_XGB_test[2000], x_df.iloc[2000], features)

### Other interpretation methods

For the following methods we need XGBoost's scikit-learn wrapper **XGBRegressor()**, so that our XGBoost model is compatible with the scikit-learn ecosystem; a sketch of the wrapper call follows the parameter dictionary below.

m_depth = 5
eta = 0.1
ssample = 0.8
col_tree = 0.8
m_child_w = 3
gam = 1.
objective = 'reg:squarederror'
param = {'max_depth': m_depth, 'eta': eta, 'subsample': ssample,
         'colsample_bytree': col_tree, 'min_child_weight': m_child_w,
         'gamma': gam, 'objective': objective}
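
A minimal sketch of passing these parameters to the scikit-learn wrapper (the training arrays `X_train` and `y_train` are assumed to exist earlier in the notebook, and `n_estimators=100` is an assumption):

from xgboost import XGBRegressor

# Hypothetical fit: the same hyperparameters as above, via the scikit-learn API
xgb_sk = XGBRegressor(max_depth=m_depth, learning_rate=eta, subsample=ssample,
                      colsample_bytree=col_tree, min_child_weight=m_child_w,
                      gamma=gam, objective=objective, n_estimators=100)
xgb_sk.fit(X_train, y_train)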
# Shape and contents of the SHAP values for the selected prediction and sample
interpreter.feat_scores[pred, sample].shape

len(interpreter.feat_scores[pred, sample].shape)

interpreter.feat_scores[pred, sample]

# Model output for the same sample (the slice drops the first two columns)
model(interpreter.test_data[pred, sample, 2:].unsqueeze(0).unsqueeze(0))

# Additivity check: the SHAP values plus the base value should approximate the model output
np.sum(interpreter.feat_scores[pred, sample]) + interpreter.explainer.expected_value[0]

interpreter.feat_names

interpreter.test_data[pred, sample, :].numpy()

shap.waterfall_plot(interpreter.explainer.expected_value[0], 
                    interpreter.feat_scores[pred, sample],
                    features=interpreter.test_data[pred, sample, 2:].numpy(), 
                    feature_names=interpreter.feat_names)

shap.waterfall_plot(interpreter.explainer.expected_value[0], 
                    interpreter.feat_scores[pred, sample],
                    features=interpreter.test_data[pred, sample, 2:].numpy(), 
                    feature_names=interpreter.feat_names,
                    max_display=2)

# du.visualization.shap_waterfall_plot(interpreter.explainer.expected_value[0], interpreter.feat_scores[pred, sample],
du.visualization.shap_waterfall_plot(0, interpreter.feat_scores[pred, sample],
                                     interpreter.test_data[pred, sample, 2:], interpreter.feat_names,
                                     max_display=2)

fig = go.Figure()
    def model_interpretation(self, patient_id, patient_preprocessed, pred,
                             prob, model):
        '''
        Generate the model's evaluation plots.
        Arguments:
            patient_id = string identifying the patient
            patient_preprocessed = dictionary with the patient's exam data
            pred = class predicted by the model
            prob = probability of the class predicted by the model
            model = model object
        '''
        #### Fetch the variables needed for the plots (imported CSV)

        #### Plot file names

        plot_1_name = 'app/ai_models/temp/probacurve-' + str(
            patient_id) + '.png'
        plot_2_name = 'app/ai_models/temp/shap-' + str(patient_id) + '.png'
        plot_3_name = 'app/ai_models/temp/dist-' + str(patient_id) + '.png'
        plot_4_name = 'app/ai_models/temp/mapa-' + str(patient_id) + '.png'

        # Plot URLs exposed by the API
        plot_1_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/probacurve-" + str(
            patient_id) + ".png"
        plot_2_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/shap-" + str(
            patient_id) + ".png"
        plot_3_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/dist-" + str(
            patient_id) + ".png"
        plot_4_api = "http://" + self.IP + ":" + self.API_PORT + "/api/media/mapa-" + str(
            patient_id) + ".png"

        #### General matplotlib settings
        DPI_IMAGES = 100
        FONT_SIZE = 8
        FONT_NAME = 'sans-serif'
        plt.rc('font', family=FONT_NAME, size=FONT_SIZE)
        plt.rc('axes', titlesize=FONT_SIZE, labelsize=FONT_SIZE)
        plt.rc('xtick', labelsize=FONT_SIZE)
        plt.rc('ytick', labelsize=FONT_SIZE)
        plt.rc('legend', fontsize=FONT_SIZE)

        #### PLOT 1 - Distribution of the probability given by the model for positive patients
        # Required: self.probs_df (imported CSV) and pred
        exame_resp = pred
        exame_prob = prob
        # Plot
        fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
        sns.kdeplot(self.probs_df['prob_neg'],
                    shade=True,
                    color='#386796',
                    ax=axis,
                    linestyle="--",
                    label='Casos Negativos')
        sns.kdeplot(self.probs_df['prob_pos'],
                    shade=True,
                    color='#F06C61',
                    ax=axis,
                    label='Casos positivos')
        # Get the XY data from the plot object for interpolation
        if exame_resp == 0:
            xi = 1 - exame_prob
            data_x, data_y = axis.lines[0].get_data()
        elif exame_resp == 1:
            xi = exame_prob
            data_x, data_y = axis.lines[1].get_data()
        # Interpolate and plot the patient's point
        yi = np.interp(xi, data_x, data_y)
        axis.plot([xi], [yi],
                  linestyle='None',
                  marker="*",
                  color='black',
                  markersize=10,
                  label='Paciente')
        # Other plot settings
        axis.legend(loc="upper right")
        #axis.set_title('Probabilidade de ser COVID Positivo pelo modelo', fontweight='bold')
        axis.set_xlim([0, 1])
        axis.set_ylim([0, axis.get_ylim()[1]])
        plt.tight_layout()
        # Save plot 1
        plt.savefig(plot_1_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close()

        #### PLOT 2 - SHAP
        # Required: patient_preprocessed, pred and model
        features = np.array(list(patient_preprocessed.keys()))
        sample_x = np.array(list(patient_preprocessed.values()))
        # Compute the SHAP values
        explainer = TreeExplainer(model=model)  # Build the SHAP explainer object
        shap_values_sample = explainer.shap_values(sample_x)  # SHAP values for the sample
        expected_value = explainer.expected_value[
            exame_resp]  # Baseline for the class predicted by the model
        # Plot
        #plt.title('Valores SHAP', fontweight='bold')
        waterfall_plot(expected_value,
                       shap_values_sample[exame_resp],
                       sample_x,
                       feature_names=features,
                       max_display=20,
                       show=False)
        # Save the image
        plt.tight_layout()
        plt.savefig(plot_2_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0)
        plt.close()

        #### PLOT 3 - Distribution of the most important variables for the model
        # Required: self.train_df (imported CSV), patient_preprocessed and pred
        important_features = [
            'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos'
        ]
        target_0 = self.train_df[self.train_df['target'] == 0][[
            'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos'
        ]]
        target_1 = self.train_df[self.train_df['target'] == 1][[
            'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos'
        ]]
        # Plot
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
        # Plot settings
        #sns.set_color_codes()
        #st = fig.suptitle("Distribuição das variáveis importantes para o modelo", fontweight='bold')
        #st.set_y (1.05)
        # Index col/row
        r = 0
        c = 0
        # Loop to plot
        for feat in important_features:
            # Plot the distribution
            sns.kdeplot(list(target_0[feat]),
                        shade=True,
                        color='#386796',
                        ax=axes[r][c],
                        label='Casos Negativos',
                        linestyle="--")
            sns.kdeplot(list(target_1[feat]),
                        shade=True,
                        color='#F06C61',
                        ax=axes[r][c],
                        label='Casos positivos')
            # Pick the density curve matching the model's predicted class
            if pred == 0:
                data_x, data_y = axes[r][c].lines[0].get_data()
            elif pred == 1:
                data_x, data_y = axes[r][c].lines[1].get_data()
            # Get the patient's value for this important variable
            xi = patient_preprocessed[feat]
            yi = np.interp(xi, data_x, data_y)
            ## Plot the point on the curve
            axes[r][c].plot([xi], [yi],
                            linestyle='None',
                            marker="*",
                            color='black',
                            markersize=10,
                            label='Paciente')
            axes[r][c].set_title(feat)
            axes[r][c].legend(loc="upper right")
            axes[r][c].set_ylim([0, axes[r][c].get_ylim()[1]])
            # Move to the next subplot position
            if c == 0:
                c += 1
            else:
                r += 1
                c = 0
        # Tidy up the layout
        plt.tight_layout()
        # Save the image
        plt.savefig(plot_3_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close()

        #### PLOT 4 - SVD map of the patients
        # Required: self.train_df (imported CSV) and patient_preprocessed
        amostra = pd.DataFrame(patient_preprocessed, index=[
            0,
        ]).drop(axis=1, columns=['Outra gripe'])

        # PCA with SVD via the prince package
        y_train = self.train_df['target']  # Keep the target column
        dados = self.train_df.drop(
            axis=1, columns=['Outra gripe',
                             'target']).copy()  # Dataset used to build the map
        pca_obj = PCA(n_components=2, random_state=42)  # PCA object
        pca_obj.fit(dados)  # Fit on the training data
        componentes = pca_obj.transform(
            dados)  # Principal components of the training data
        transf = pca_obj.transform(amostra)  # Project the patient onto the PCA space
        xi = transf.loc[0, 0]  # Patient's X coordinate for the plot
        yi = transf.loc[0, 1]  # Patient's Y coordinate for the plot
        comp = pd.DataFrame()  # Dataframe holding the components
        comp['C1'] = componentes[0]  # Principal component 1
        comp['C2'] = componentes[1]  # Principal component 2
        comp['TG'] = y_train  # Target variable used as a mask
        comp_0 = comp[comp['TG'] == 0][['C1', 'C2']]  # Components for negative cases
        comp_1 = comp[comp['TG'] == 1][['C1', 'C2']]  # Components for positive cases
        # Plot
        fig, ax = plt.subplots(figsize=(8, 8))
        plt.margins(0, 0)
        sns.scatterplot(ax=ax,
                        data=comp_0,
                        x='C1',
                        y='C2',
                        color='#386796',
                        label='Casos Negativos')
        sns.scatterplot(ax=ax,
                        data=comp_1,
                        x='C1',
                        y='C2',
                        color='#F06C61',
                        label='Casos Positivos')
        x_mean, y_mean, width, height, angle = self.build_ellipse(
            comp_0['C1'], comp_0['C2'])
        ax.add_patch(
            Ellipse((x_mean, y_mean),
                    width,
                    height,
                    angle=angle,
                    linewidth=2,
                    color='#386796',
                    fill=True,
                    alpha=0.2))
        x_mean, y_mean, width, height, angle = self.build_ellipse(
            comp_1['C1'], comp_1['C2'])
        ax.add_patch(
            Ellipse((x_mean, y_mean),
                    width,
                    height,
                    angle=angle,
                    linewidth=2,
                    color='#F06C61',
                    fill=True,
                    alpha=0.2))
        ax.plot([xi], [yi],
                linestyle='None',
                marker="*",
                color='black',
                markersize=10,
                label='Paciente')
        # Plot settings
        #ax.set_title('Similaridade entre pacientes',fontweight='bold')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_ylabel('')
        ax.set_xlabel('')
        handles, labels = ax.get_legend_handles_labels()
        labels, handles = zip(
            *sorted(zip(labels, handles), key=lambda t: t[0]))
        ax.legend(handles, labels, loc="upper right")
        # Save the image
        plt.axis('off')
        plt.savefig(plot_4_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0)
        plt.close()

        # Return the results
        model_result = {
            'prediction': pred,
            'probability': str(round(prob * 100, 2)),
            'probacurve': plot_1_api,
            'shap_img': plot_2_api,
            'dist_img': plot_3_api,
            'mapa_img': plot_4_api
        }
        return model_result
        """
Example #9
def graficos():

    shap.initjs()
    # paths
    ruta_registro = './otros/registro.csv'
    ruta_modelo = './otros/final_model.pkl'
    ruta_imputer = './otros/KNNimputer.pickle'

    # new record
    nuevo_registro = pd.read_csv(ruta_registro)

    # model
    loaded_model = pickle.load(open(ruta_modelo, 'rb'))

    # KNN imputer
    imputer = pickle.load(open(ruta_imputer, 'rb'))

    # variables to impute
    imputar_lista = [
        'LANDAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_AVG',
        'YEARS_BEGINEXPLUATATION_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE',
        'TOTALAREA_MODE', 'REGION_RATING_CLIENT', 'APARTMENTS_MODE',
        'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MODE',
        'LIVINGAPARTMENTS_AVG', 'OBS_60_CNT_SOCIAL_CIRCLE', 'COMMONAREA_AVG',
        'LIVINGAREA_AVG', 'ENTRANCES_AVG', 'LIVINGAREA_MODE',
        'OBS_30_CNT_SOCIAL_CIRCLE', 'NONLIVINGAREA_MODE', 'LIVINGAREA_MEDI',
        'NONLIVINGAPARTMENTS_MEDI', 'ELEVATORS_AVG', 'FLOORSMIN_AVG',
        'FLAG_DOCUMENT_13', 'FLOORSMAX_AVG', 'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_18', 'WALLSMATERIAL_MODE_Panel', 'ELEVATORS_MODE',
        'FLOORSMIN_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMAX_MODE',
        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_6',
        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20'
    ]

    # imputation
    imputado_nuevo = pd.DataFrame(imputer.transform(
        np.array(nuevo_registro).reshape(1, -1)),
                                  columns=nuevo_registro.columns)

    # assign the imputed variables
    nuevo_registro.loc[:, imputar_lista] = imputado_nuevo.loc[:, imputar_lista]

    # shap values
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(nuevo_registro)

    # figure 1
    plt.clf()
    shap.force_plot(explainer.expected_value[1],
                    shap_values[1][0, :],
                    nuevo_registro.iloc[0, :],
                    link='logit',
                    matplotlib=True,
                    show=False)
    plt.savefig('./www/summary_plot1.png',
                bbox_inches='tight',
                pad_inches=1.0,
                dpi=1000)

    # figure 2
    plt.figure(figsize=(5, 10))  # figsize is specified in inches
    shap.waterfall_plot(explainer.expected_value[1],
                        shap_values[1][0, :],
                        features=nuevo_registro.iloc[0, :],
                        feature_names=nuevo_registro.columns.values,
                        show=False)
    plt.savefig("./www/summary_plot2.png",
                bbox_inches='tight',
                pad_inches=1.0,
                dpi=1000)

    return True
        if prediction_explanation:
            st.subheader('Prediction Explanation')
            st.markdown(
                '''<p><small>The waterfall plot is designed to visually display how
                        the values of each feature move the <code>average</code> stock value to the
                        <code>predicted</code> stock value. Visit
                        <a href="https://price-valuation-explainer.herokuapp.com/" target="_blank">Stock Valuation Explainer</a> for more detail.
                        </small></p>''',
                unsafe_allow_html=True)

            shap_values = models[f'{key} Explainer'](
                data['Features'][key].loc[slice(ticker, ticker), :])[0]

            _lock = RendererAgg.lock
            with _lock:
                shap.waterfall_plot(shap_values, max_display=20)
                st.pyplot()

        # SIMILAR STOCKS
        if similiar_stocks:
            st.subheader('Similar Stocks')
            st.markdown(
                '''<p><small>The 10 most similar stocks using <code>cosine similarity</code></small></p>''',
                unsafe_allow_html=True)

            cols = [
                'Ticker', 'Close', 'Predicted Close', 'Company Name', 'Sector',
                'Industry', 'Market-Cap', 'Enterprise Value',
                'Price to Earnings Ratio (ttm)', 'Price to Sales Ratio (ttm)',
                'Price to Book Value', 'Price to Free Cash Flow (ttm)',
                'EV/EBITDA', 'EV/Sales', 'EV/FCF', 'Book to Market Value',
def main():
    logging.info("Main script is refreshed...")

    # Custom functionality for ensuring changing widgets do not cause previous sections to reset
    state = get_state()
    st.title("What Makes a Playlist Successful?")
    st.write(
        "**This application trains & evaluates playlist success classification models, "
        "and generates SHAP visualizations for analyzing feature importance**")
    st.write(
        "[Created By: Alexander Wong](https://www.linkedin.com/in/alexrobwong/)",
        unsafe_allow_html=True,
    )

    if st.checkbox("Click to watch recorded demo"):
        st.video("https://www.youtube.com/watch?v=dPsGxb9lTUY")

    # Sidebar Inputs -------------------------------------------------------------------------------------------------
    experiment_name_input = st.sidebar.text_input("Experiment name:")
    experiment_name = f"{experiment_name_input}_{str(datetime.now())}"

    genre_options = GENRES
    default_ix = GENRES.index("Dance & House")
    selected_genre = st.sidebar.selectbox("Select genre:",
                                          options=genre_options,
                                          index=default_ix)

    # selected genre must be a list
    genre = [selected_genre]

    users_threshold = st.sidebar.number_input(
        "Minimum monthly number of Users:",
        min_value=10,
    )
    success_threshold = (st.sidebar.slider(
        "Streaming-ratio success threshold (%):",
        min_value=1,
        max_value=99,
        value=70,
    ) / 100)
    holdout_fraction = (st.sidebar.slider(
        "Test Size (%):", min_value=1, max_value=30, value=5) / 100)
    model_map = {
        "Extreme Gradient Boosting": "xgboost",
        "Decision Tree Classifier": "dt",
        "Extra Trees Classifier": "et",
        "Light Gradient Boosting Machine": "lightgbm",
        "Random Forest Classifier": "rf",
    }
    model_selection = list(
        st.sidebar.multiselect("Models to train:",
                               options=list(model_map.keys())))
    optionals = st.sidebar.beta_expander(
        "Additional Feature Engineering Parameters", False)
    polynomials_box = optionals.checkbox("Feature Polynomials")
    interactions_box = optionals.checkbox("Feature Interactions")
    ratios_box = optionals.checkbox("Feature Ratios")

    polynomials = bool(polynomials_box)
    interactions = bool(interactions_box)
    ratios = bool(ratios_box)

    # Experiment & Model Training -------------------------------------------------------------------------------------
    train = st.checkbox("Click to train models")
    if train:

        # Application can only be run start to finish if xgboost is selected...add it to the list of options
        exb_added = False
        if "Extreme Gradient Boosting" not in model_selection:
            model_selection.append("Extreme Gradient Boosting")
            exb_added = True

        # Bugfix - must select at least two models to train, otherwise the model object is used instead of an index
        lgb_added = False
        if "Light Gradient Boosting Machine" not in model_selection:
            model_selection.append("Light Gradient Boosting Machine")
            lgb_added = True

        include_models = [model_map[x] for x in list(model_selection)]

        # Check that models are selected - if none are selected, all models will be trained (undesired app behavior)
        if not include_models:
            raise Exception(
                "No models were selected. Please re-start the application")

        base_frame = pd.read_parquet("data/streamlit_data.parquet")
        state.genre_frame = base_frame.loc[lambda f: f["genre_1"].isin(genre)]
        labelled_frame = classify_success(state.genre_frame, users_threshold,
                                          success_threshold)

        train_frame, holdout_frame = create_holdout(
            labelled_frame, holdout_fraction=holdout_fraction)

        # PyCaret setup to train models
        if not state.experiment_complete:
            with st.spinner("Model Training in Progress"):
                if exb_added:
                    st.success(
                        "**Extreme Gradient Boosting Model** automatically added by default into model pipeline"
                    )
                if lgb_added:
                    st.success(
                        "**Light Gradient Boosting Machine Model** automatically added by default into model pipeline"
                    )
                setup(
                    data=train_frame,
                    numeric_features=MODEL_NUMERICAL_FEATURES,
                    categorical_features=MODEL_CATEGORICAL_FEATURES,
                    target="success_streaming_ratio_users",
                    ignore_features=["playlist_uri"],
                    test_data=holdout_frame,
                    session_id=123,
                    ignore_low_variance=True,
                    remove_outliers=True,
                    fix_imbalance=True,
                    remove_multicollinearity=True,
                    log_experiment=True,
                    log_data=True,
                    fold=2,
                    n_jobs=-1,
                    combine_rare_levels=True,
                    experiment_name=experiment_name,
                    silent=True,
                    feature_interaction=interactions,
                    feature_ratio=ratios,
                    polynomial_features=polynomials,
                )
                state.list_models = compare_models(n_select=5,
                                                   round=3,
                                                   cross_validation=False,
                                                   include=include_models)
                state.experiment_complete = True

                state.X_train = get_config(variable="X_train")
                state.y_train = get_config(variable="y_train")
                state.view = pd.merge(state.y_train,
                                      state.X_train,
                                      left_index=True,
                                      right_index=True).reset_index(drop=True)

        # Display model training results
        st.header("Model Training & Testing Results")
        exp = pull()
        st.dataframe(exp)
        st.info("**Models were trained using default parameters**")
        st.info(
            "To improve individual model performance,"
            "please consider offline **hyperparameter tuning** techniques such as **Grid Search**. "
            "To improve overall performance, please consider advanced offline **ensembling** techniques "
            "such as **Bagging**, **Boosting**, **Stacking**")

        # Model Definitions
        models_expander = st.beta_expander("Model Definitions")
        models_expander.write(
            "[**Decision Tree Classifier**](https://en.wikipedia.org/wiki/Decision_tree_learning)"
        )
        models_expander.write(
            "A Decision Tree is a simple representation for "
            "classifying examples, a form of Supervised Machine Learning where the data is "
            "continuously split according to a certain parameter. A decision tree starts with a "
            "single node, which branches into possible outcomes. Each of those outcomes "
            "leads to additional nodes, which branch off into other possibilities"
        )
        models_expander.write("")
        models_expander.write(
            "[**Random Forest Classifier**](https://en.wikipedia.org/wiki/Random_forest)"
        )
        models_expander.write(
            "An ensemble learning method"
            "that operates by constructing a multitude of decision trees at training time, "
            "where each tree is trained on a bootstrap replica of the training data and final "
            "model classification is decide via majority vote from the constituent trees"
        )
        models_expander.write("")
        models_expander.write(
            "[**Extra Trees Classifier**](https://quantdare.com/what-is-the-difference-between"
            "-extra-trees-and-random-forest/)")
        models_expander.write(
            "Extremely randomized trees is similar to Random Forest, "
            "in that it builds multiple trees and splits nodes using random subsets of features, "
            "but with two key differences: it does not bootstrap observations (meaning it samples "
            "without replacement), and nodes are split on random splits, not best splits"
        )
        models_expander.write("")
        models_expander.write(
            "[**Extreme Gradient Boosting**](https://en.wikipedia.org/wiki/Gradient_boosting)"
        )
        models_expander.write(
            "Boosting is a technique which combines weak learners in series "
            "to achieve a strong learner from many sequentially connected "
            "weak learners. In the case of gradient boosted decision trees, "
            "the weak learners are decision trees, where each tree attempts to minimize "
            "the errors of the previous tree. Trees in boosting are weak learners, "
            "but adding many trees in series, each focusing on the errors of the "
            "previous one, makes boosting a highly efficient and accurate model")
        models_expander.write("")
        models_expander.write(
            "[**Light Gradient Boosting Machine**](https://lightgbm.readthedocs.io/en/latest/)"
        )
        models_expander.write(
            "A gradient boosting framework for machine "
            "learning originally developed by Microsoft. Similar to Extreme Gradient Boosting, "
            "it is based on decision tree algorithms, however unlike Extreme Gradient Boosting, "
            "the algorithm splits the tree leaf wise instead of level wise")
        models_expander.write("")

        # Model Evaluation Metrics
        metrics_expander = st.beta_expander("Model Evaluation Metrics")
        metrics_expander.write("**Accuracy**")
        metrics_expander.write(
            "Accuracy is defined as the percentage of correct predictions for the test data."
            " It can be calculated easily by dividing the number of correct predictions by the "
            "number of total predictions.")
        metrics_expander.write("")
        metrics_expander.write("**AUC**")
        metrics_expander.write(
            "An ROC curve (receiver operating characteristic curve) is a graph showing the "
            "performance of a classification model at all classification thresholds. This curve "
            "plots the True Positive Rate (TPR) against the False Positive Rate (FPR)")
        metrics_expander.write("")
        metrics_expander.write("**Recall**")
        metrics_expander.write(
            "Recall is defined as the fraction of examples which were predicted to belong "
            "to a class with respect to all of the examples that truly belong in the class."
        )
        metrics_expander.write("")
        metrics_expander.write("**Precision**")
        metrics_expander.write(
            "Precision is defined as the fraction of relevant examples (true positives) among "
            "all of the examples which were predicted to belong in a certain class."
        )
        metrics_expander.write("")
        metrics_expander.write("**F1**")
        metrics_expander.write(
            "The traditional F-measure or balanced F-score (F1 score) is the harmonic mean "
            "of precision and recall and is calculated as --> F1 score = 2 * (Precision * Recall) / "
            "(Precision + Recall)")
        metrics_expander.write("")
        metrics_expander.write("**Kappa**")
        metrics_expander.write(
            "The Kappa statistic (or value) is a metric that compares an Observed Accuracy with "
            "an Expected Accuracy (random chance). The kappa statistic is used not only to evaluate "
            "a single classifier, but also to evaluate classifiers amongst themselves. In addition, "
            "it takes into account random chance (agreement with a random classifier), which"
            " generally means it is less misleading than simply using accuracy as a metric "
            "(an Observed Accuracy of 80% is a lot less impressive with an Expected Accuracy of "
            "75% versus an Expected Accuracy of 50%)")
        metrics_expander.write("")
        metrics_expander.write("**MCC**")
        metrics_expander.write(
            "Unlike the other metrics discussed above, MCC takes all the cells of the Confusion"
            " Matrix into consideration in its formula --> MCC = (TP * TN - FP * FN) / "
            "sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Similar to a correlation "
            "coefficient, MCC ranges from -1 to +1. A model with a score of +1 is a perfect model "
            "and -1 is a poor model. This property is one of the key strengths of MCC, as it "
            "leads to easy interpretability.")
        metrics_expander.write("")

        # Additional model data
        opts = st.beta_expander("Additional Model Data", False)
        # Download the training data as an excel file
        if opts.button("Display Link to Download Model Training Data"):
            st.markdown(get_table_download_link(state.view),
                        unsafe_allow_html=True)

        # Prompt to launch MLFlow
        if opts.button("Display Link to Spotify Model Training History"):
            st.info(
                "Note that this application uses MLFlow only when both the application and MLFlow are "
                "deployed locally")

        # Overall importance ------------------------------------------------------------------------------------------
        st.write("")  # Intentional extra blank spaces
        st.write("")
        st.header(f"Success Drives for {selected_genre} Playlists")
        dict_models = {}
        for i, model in enumerate(exp.index):
            dict_models[model] = i

        user_selected_model = st.selectbox(
            "Select model to view feature importance:", exp.index)
        state.importance = st.checkbox("Click to calculate feature importance")
        if state.importance and state.experiment_complete:
            state.new_selected_model = state.list_models[
                dict_models[user_selected_model]]
            st.write("**Model parameters: **")
            st.write(state.new_selected_model)
            st.write("")
            st.write("**Generating Visualizations...**")
            bar = st.progress(0)

            if state.selected_model != state.new_selected_model:
                state.selected_model = state.new_selected_model
                state.explainer = shap.TreeExplainer(state.selected_model)
                state.shap_values = state.explainer.shap_values(
                    state.X_train.to_numpy())
            bar.progress(25)

            # Overall Feature Importance -------------------------------------------------------------------------
            st.subheader("Success Drivers - Average")
            st.pyplot(
                shap.summary_plot(state.shap_values,
                                  state.X_train,
                                  plot_type="bar"))

            # Violin plot and waterfall plot only available at this time for XGBoost model
            if user_selected_model != "xgboost":
                st.warning(
                    "This PoC has only been configured for when **Extreme Gradient Boosting "
                    "(xgboost)** is selected for analysis")
                bar.progress(100)
                st.stop()

            else:
                # Violin Feature Importance --------------------------------------------------------------------------
                st.subheader(
                    f"Success Drivers - All {selected_genre} Playlists")
                st.pyplot(shap.summary_plot(state.shap_values, state.X_train))
                bar.progress(50)

                # Dependence plots for each of the top 3 features ----------------------------------------------------
                st.header(f"Shapley Dependence for {selected_genre} Playlists")
                vals = np.abs(state.shap_values).mean(0)
                feature_importance = pd.DataFrame(
                    list(zip(state.X_train.columns, vals)),
                    columns=["col_name", "feature_importance_vals"],
                )
                feature_importance = (feature_importance.sort_values(
                    by=["feature_importance_vals"],
                    ascending=False).reset_index(drop=True).head(3))

                top_features = list(feature_importance["col_name"])
                for feature in top_features:
                    index = list(state.X_train.columns).index(feature)
                    st.subheader(f"Shapley Value Dependence for {feature}")
                    st.pyplot(
                        shap.dependence_plot(
                            index,
                            state.shap_values,
                            state.X_train,
                            alpha=0.5,
                            interaction_index=None,
                        ))
                bar.progress(70)

                # Individual importance -------------------------------------------------------------------------------
                st.header(
                    f"Explaining {selected_genre} Playlist Success Prediction")

                # Display the data frame for users to visually see the row they want to analyze
                st.subheader("Model Training Data")
                st.dataframe(state.view)
                state.new_row = int(
                    st.number_input(
                        "Row from dataframe to inspect",
                        min_value=0,
                        max_value=len(state.view),
                        value=10,
                    ))
                if state.row != state.new_row:
                    state.row = state.new_row
                    shap_object = ShapObject(
                        base_values=state.explainer.expected_value,
                        values=state.explainer.shap_values(
                            state.X_train)[state.row, :],
                        feature_names=state.X_train.columns,
                        data=state.X_train.iloc[state.row, :],
                    )
                    bar.progress(85)
                    st.subheader(
                        f"Feature Contributions to {selected_genre} Playlist #{state.row}"
                    )
                    st.pyplot(shap.waterfall_plot(shap_object))
                    bar.progress(100)
                    st.stop()
                else:
                    st.stop()
        else:
            st.stop()
    else:
        st.stop()
Example #12
	predict_proba = RFModel.predict_proba(data_for_prediction_array)

	import warnings
	warnings.filterwarnings("ignore")
	# Create SHAP explainer
	explainer = shap.TreeExplainer(RFModel)	

	# Get shap values for observation of interest
	shap_values = explainer.shap_values(data_for_prediction.values, check_additivity=False)

	decisionhtml = shap.decision_plot(base_value=explainer.expected_value[1],
	                                  shap_values=shap_values[1],
	                                  features=data_for_prediction,
	                                  feature_names=data_for_prediction.columns.tolist(),
	                                  show=False)
	plt.savefig('decisionPlot.pdf')
	plt.close()

	onedshap_values = shap_values[1].flatten()
	shap.waterfall_plot(explainer.expected_value[1], onedshap_values, feature_names=data_for_prediction.columns, max_display=10, show=False)
	plt.savefig('waterfallPlot.pdf')
	plt.close()

	# SHAP Plots for Class 1 (sRNA-mRNA Interaction)
	forcehtml = shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)
	shap.save_html(out_file = 'forcePlot.html', full_html=False, plot = forcehtml)

	
elif((len(sys.argv) - 1) < 2):

	print("Error: Required parameters not passed! Please pass two parameters, sRNA ID and mRNA ID.")


else:
shap.force_plot(mlp_interpreter.explainer.expected_value[pred],
                mlp_shap_values[pred][sample],
                features=test_features[sample, 1:].numpy(),
                feature_names=feat_names)

# #### Sample decision plot

shap.decision_plot(mlp_interpreter.explainer.expected_value[pred],
                   mlp_shap_values[pred][sample],
                   features=test_features[sample, 1:].numpy(),
                   feature_names=feat_names)

# #### Sample waterfall plot

shap.waterfall_plot(mlp_interpreter.explainer.expected_value[pred],
                    mlp_shap_values[pred][sample],
                    features=test_features[sample, 1:].numpy(),
                    feature_names=feat_names)

test_features[:, 1:].shape

test_features[0].shape

test_features[0].unsqueeze(0).shape

# ## Interpreting XGBoost

# ### Loading the model

xgb_model = joblib.load(f'{models_path}xgb/checkpoint_27_01_2020_04_47.model')
xgb_model
Example #14
def explain(client):
    st.header('Model Explainability')
    st.markdown("""<p style="text-align:justify;">
        Model explainability is the ability to explain the internal mechanics 
        of a model in human terms. It is an important tool to make reasoning 
        behind each decision in machine learning transparent and repeatable.<br>
        <a href="https://github.com/slundberg/shap" target="_blank">SHAP</a> 
        (SHapley Additive exPlanations) is a Python module that uses a game 
        theoretic approach to explain machine learning models. It will be 
        used to explore the explainability of all XGBoost models for 
        different wind farms.</p>""",
                unsafe_allow_html=True)

    farm_select = st.sidebar.selectbox('Select a farm', FARM_NAME_LIST)
    show_gif(icon='default')
    farm = FARM_LIST[FARM_NAME_LIST.index(farm_select)]
    df = load_data(client, farm, limit=200)
    models = load_models()
    model = models[farm]
    with st.spinner('Running Calculations...'):
        X, _ = transform_data(df)
        # pred_contribs=True returns per-feature contributions plus a bias term in the last column
        shap_val = model.get_booster().predict(DMatrix(X), pred_contribs=True)
        expected_val = shap_val[0][-1]  # the bias column is the expected (base) value
        shap_val = np.delete(shap_val, obj=-1, axis=1)  # drop the bias column, keep per-feature values

    col_name = [format_title(col) for col in list(X.columns)]

    importance = st.beta_expander('Feature importance based on SHAP value',
                                  expanded=True)
    importance.markdown("""<p style="text-align:justify;">
        The following plot summarizes feature importance based on SHAP
        values (i.e. how much each feature changes the model outcome
        when conditioning on that feature). The features are sorted by
        the sum of the magnitudes of SHAP values. The colour represents
        the feature value, where red is high and blue is low.<br>
        For example, if a red (high feature value) data point shows a
        positive SHAP value, it increases the predicted value; if the
        SHAP value is negative, it lowers the predicted value. A point
        far away from zero also has a higher impact (either
        negative or positive) than a point near zero.</p>""",
                        unsafe_allow_html=True)

    summary_plot(shap_val, X, show=False, feature_names=col_name)
    importance.pyplot(bbox_inches='tight', dpi=150)

    contribution = st.beta_expander(
        'Feature contribution for individual prediction')
    contribution.markdown("""<p style="text-align:justify;">
        The waterfall plot below demonstrates how much each feature 
        contributes to pushing the model from the baseline value 
        (indicated by <i>E[f(X)]</i>) to model output (indicated 
        by <i>f(X)</i>) in an intuitive manner.<br>
        The plot can show all the individual predictions for today 
        & tomorrow (48 h in total). Use the slider below to choose 
        which hour's prediction you'd like to view.</p>""",
                          unsafe_allow_html=True)

    pred = df[-48:].reset_index(drop=True).prediction
    i = contribution.slider('Select the hour',
                            min_value=1,
                            max_value=48,
                            value=24)

    plt.rcParams.update({'font.size': 20})
    fig, ax = plt.subplots(figsize=(18, 2))
    ax.plot(range(1, 49), pred, c='#1e88e5')
    ax.scatter(i, pred[i - 1], c='#ff0d57', s=300)
    ax.xaxis.set_ticks(np.arange(0, 48, step=6))
    ax.set_xlim(1, 48)
    ax.set_xlabel('Hour')
    ax.tick_params(axis="y", direction="in", pad=-42)
    ax.get_yaxis().set_ticks([])

    contribution.pyplot(bbox_inches='tight', dpi=150, pad_inches=0.01)

    waterfall_plot(expected_val,
                   shap_val[-48:][i - 1],
                   feature_names=col_name,
                   max_display=10,
                   show=False)
    contribution.pyplot(bbox_inches='tight', dpi=150, pad_inches=0)