Example #1
def test_lime_explain_probabilistic(newsgroups_train):
    docs, y, target_names = newsgroups_train
    try:
        vec = HashingVectorizer(alternate_sign=False)
    except TypeError:
        # sklearn < 0.19
        vec = HashingVectorizer(non_negative=True)
    clf = MultinomialNB()

    X = vec.fit_transform(docs)
    clf.fit(X, y)
    print(clf.score(X, y))

    pipe = make_pipeline(vec, clf)
    doc = docs[0]

    te = TextExplainer(random_state=42)
    te.fit(doc, pipe.predict_proba)

    print(te.metrics_)
    assert te.metrics_['score'] > 0.7
    assert te.metrics_['mean_KL_divergence'] < 0.1

    res = te.explain_prediction(top=20, target_names=target_names)
    expl = format_as_text(res)
    print(expl)
    assert 'file' in expl
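A condensed sketch of the same TextExplainer-plus-format_as_text pattern, using a tiny stand-in corpus and pipeline (the toy data and names below are illustrative, not from the original test):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from eli5.lime import TextExplainer
from eli5.formatters import format_as_text

# Toy corpus standing in for the newsgroups data (purely illustrative)
docs = ["the file was attached", "graphics card drivers",
        "file not found error", "render the image"]
labels = ["comp.files", "comp.graphics", "comp.files", "comp.graphics"]
pipe = make_pipeline(TfidfVectorizer(), LogisticRegression()).fit(docs, labels)

te = TextExplainer(random_state=42)
te.fit(docs[0], pipe.predict_proba)   # sample around one document, fit a local white-box model
print(format_as_text(te.explain_prediction(top=20)))
print(te.metrics_)                    # 'score' and 'mean_KL_divergence' of the local approximation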
Example #2
def explain_pred(sentence):
    te.fit(sentence, pipe.predict_proba)
    #txt = format_as_text(te.explain_prediction(target_names=["green", "neutral", "red"]))
    t_pred = te.explain_prediction(top=20,
                                   target_names=[
                                       "ANB",
                                       "CAP",
                                       "ECON",
                                       "EDU",
                                       "ENV",
                                       "EX",
                                       "FED",
                                       "HEG",
                                       "NAT",
                                       "POL",
                                       "TOP",
                                       "ORI",
                                       "QER",
                                       "COL",
                                   ])
    txt = format_as_text(t_pred)
    html = format_as_html(t_pred)
    with open("latest_prediction.html", "a+") as html_file:
        html_file.write(html)
    print(te.metrics_)
Example #3
def explain_prediction_me(x, model, feature_name_list):
    from eli5.explain import explain_prediction
    params = {}
    params['feature_names'] = feature_name_list
    params['top'] = 5
    expl = explain_prediction(model.get_booster(), x, **params)

    for target_explanation in expl.targets:
        print("class " + str(target_explanation.target) +
              " probability: " + str(target_explanation.proba) +
              " score: " + str(target_explanation.score))
        print("Positive:")
        for feature_weight in target_explanation.feature_weights.pos:
            print(str(feature_weight.feature) + ": weight: " +
                  str(feature_weight.weight) + " actual value: " +
                  str(feature_weight.value))
        print("Negative:")
        for feature_weight in target_explanation.feature_weights.neg:
            print(str(feature_weight.feature) + ": weight: " +
                  str(feature_weight.weight) + " actual value: " +
                  str(feature_weight.value))

    from eli5.formatters import format_as_text
    params_text = {}
    params_text['show_feature_values'] = True
    return format_as_text(expl, **params_text)
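Example #3 walks the Explanation object by hand: expl.targets holds one entry per class, each carrying target, proba and score plus feature_weights.pos / feature_weights.neg lists whose items expose the feature name, weight and actual feature value. A hedged, self-contained sketch of the same traversal (the iris data and random-forest model are stand-ins, not from the source), including the format_as_dict shortcut:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from eli5 import explain_prediction
from eli5.formatters import format_as_dict

data = load_iris()
clf = RandomForestClassifier(n_estimators=20, random_state=0).fit(data.data, data.target)

expl = explain_prediction(clf, data.data[0],
                          feature_names=list(data.feature_names), top=5)
for target in expl.targets:                           # one entry per class
    print(target.target, target.proba, target.score)
    for fw in target.feature_weights.pos + target.feature_weights.neg:
        print(fw.feature, fw.weight, fw.value)        # value is the actual feature value

print(format_as_dict(expl))                           # same structure as plain dicts/lists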
Example #4
def test_lime_explain_probabilistic(newsgroups_train):
    docs, y, target_names = newsgroups_train
    vec = HashingVectorizer(non_negative=True)
    clf = MultinomialNB()

    X = vec.fit_transform(docs)
    clf.fit(X, y)

    pipe = make_pipeline(vec, clf)
    doc = docs[0]

    clf_local, vec_local, metrics = get_local_pipeline_text(doc,
                                                            pipe.predict_proba,
                                                            n_samples=5000,
                                                            expand_factor=10)
    print(metrics)
    assert metrics['score'] > 0.7

    res = explain_prediction_sklearn(clf_local,
                                     doc,
                                     vec_local,
                                     top=10,
                                     target_names=target_names)
    expl = format_as_text(res)
    print(expl)
    assert 'file' in expl
Example #5
def assert_tree_explain_prediction_single_target(clf, X, feature_names):
    get_res = lambda _x, **kwargs: explain_prediction(
        clf, _x, feature_names=feature_names, **kwargs)
    res = get_res(X[0])
    for expl in format_as_all(res, clf):
        assert_feature_values_present(expl, feature_names, X[0])

    checked_flt = False
    all_expls = []
    for x in X[:5]:
        res = get_res(x)
        text_expl = format_as_text(res, show=fields.WEIGHTS)
        print(text_expl)
        assert '<BIAS>' in text_expl
        check_targets_scores(res)
        all_expls.append(text_expl)

        get_all = lambda fw: get_all_features(fw.pos) | get_all_features(fw.neg)
        all_features = get_all(res.targets[0].feature_weights)
        if len(all_features) > 1:
            f = list(all_features - {'<BIAS>'})[0]
            flt_res = get_res(x, feature_filter=lambda name, _: name != f)
            flt_features = get_all(flt_res.targets[0].feature_weights)
            assert flt_features == (all_features - {f})
            checked_flt = True

    assert checked_flt
    assert any(f in ''.join(all_expls) for f in feature_names)
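The feature_filter callable used above receives the feature name and its value, and lets you drop entries from the explanation. A self-contained illustration under assumed data (iris and a decision tree are stand-ins for the estimators this helper is run against):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from eli5 import explain_prediction
from eli5.formatters import format_as_text, fields

data = load_iris()
clf = DecisionTreeClassifier(random_state=42).fit(data.data, data.target)

expl = explain_prediction(
    clf, data.data[0],
    feature_names=list(data.feature_names),
    feature_filter=lambda name, value: name != '<BIAS>',   # hide the bias pseudo-feature
)
print(format_as_text(expl, show=fields.WEIGHTS))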
Example #6
def test_format_html_options(force_weights, horizontal_layout):
    # test options that are not tested elsewhere
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg = LinearRegression()
    reg.fit(X, y)
    res = explain_weights_sklearn(reg)
    kwargs = dict(
        force_weights=force_weights, horizontal_layout=horizontal_layout)
    postfix = '_' + '_'.join(
        '{}-{}'.format(k, v) for k, v in sorted(kwargs.items()))
    print(kwargs, postfix)
    # just check that it does not crash
    expl = format_as_html(res, **kwargs)
    write_html(reg, expl, format_as_text(res), postfix=postfix)
    pred_res = explain_prediction_sklearn(reg, X[0])
    pred_expl = format_as_html(pred_res, **kwargs)
    write_html(reg, pred_expl, format_as_text(pred_res),
               postfix='_expl' + postfix)
Example #7
    def explain_prediction(self, x, column_id, feature_names):
        from eli5.explain import explain_prediction
        params = {}
        params['feature_names'] = feature_names
        params['top'] = 5
        expl = explain_prediction(self.model[column_id], x, **params)
        from eli5.formatters import format_as_text
        params_text = {}
        params_text['show_feature_values'] = True
        return format_as_text(expl, **params_text)
Example #8
    def explain_prediction(self, x, model):
        from eli5.explain import explain_prediction
        params = {}
        params['feature_names'] = self.feature_name_list
        params['top'] = 5
        expl = explain_prediction(model, x, **params)
        from eli5.formatters import format_as_text
        params_text = {}
        params_text['show_feature_values'] = True
        return format_as_text(expl, **params_text)
Example #9
File: utils.py Project: soprof/eli5
def format_as_all(res, clf, **kwargs):
    """ Format explanation as text and html, check JSON-encoding,
    print text explanation, save html, return text and html.
    """
    expl_dict = format_as_dict(res)
    pprint(expl_dict)
    json.dumps(expl_dict)  # check that it can be serialized to JSON
    expl_text = format_as_text(res, **kwargs)
    expl_html = format_as_html(res, **kwargs)
    print(expl_text)
    write_html(clf, expl_html, expl_text, caller_depth=2)
    return expl_text, expl_html
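format_as_all above round-trips the explanation through format_as_dict and json.dumps to confirm it is JSON-serializable; the same check in isolation, on an assumed stand-in model (iris plus logistic regression, purely illustrative):

import json
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from eli5 import explain_weights
from eli5.formatters import format_as_dict

data = load_iris()
clf = LogisticRegression(max_iter=1000).fit(data.data, data.target)
res = explain_weights(clf, feature_names=list(data.feature_names))

expl_dict = format_as_dict(res)   # nested dicts/lists of plain Python types
json.dumps(expl_dict)             # raises TypeError if anything non-serializable slipped through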
Example #10
def visualize_model(dataSet, column_id, final_gb, feature_name_list, train,
                    target_run, res):
    try:
        column_name = dataSet.clean_pd.columns[column_id]

        feature_name_list_err_corr = list(feature_name_list)
        print "missing features: " + str(
            len(final_gb[column_id].feature_names) - len(feature_name_list))

        if len(final_gb[column_id].feature_names) - len(feature_name_list) > 0:
            for err_corr_id in range(dataSet.shape[1]):
                if dataSet.is_column_applicable(
                        err_corr_id) and err_corr_id != column_id:
                    feature_name_list_err_corr.append(
                        "error_corr_" +
                        str(dataSet.clean_pd.columns[err_corr_id]))

        directory = Config.get("logging.folder") + '/out/html/' + dataSet.name
        if not os.path.exists(directory):
            os.makedirs(directory)
        path = directory + '/' + str(column_name) + '_' + str(
            train[column_id].shape[0]) + '_' + str(time.time()) + '.html'

        table_content = show_weights(final_gb[column_id],
                                     feature_names=feature_name_list_err_corr,
                                     importance_type="gain").data

        # print table_content
        from ml.VisualizeSVD import replace_with_url

        table_content = replace_with_url(table_content, dataSet)

        url = 'file://' + path
        html = "<h1>" + str(column_name) + "</h1>"
        html += "<h2>number of labels: " + str(
            train[column_id].shape[0]) + "</h2>"
        html += "<h2>F-Score: " + str(f1_score(target_run,
                                               res[column_id])) + "</h2>"
        html += str(table_content)

        with open(path, 'w') as webf:
            webf.write(html)
        # webbrowser.open(url)
    except jinja2.exceptions.UndefinedError:
        print(
            format_as_text(
                explain_weights(final_gb[column_id],
                                feature_names=feature_name_list)))
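This example (like Example #14 below) exports eli5's HTML rendering by reading .data from the object returned by show_weights; a minimal sketch of that pattern on an assumed toy XGBoost model (requires xgboost and IPython; names and the output path are illustrative):

import numpy as np
from xgboost import XGBClassifier
from eli5 import show_weights

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)
booster = XGBClassifier(n_estimators=10).fit(X, y)

# show_weights returns an IPython HTML display object; .data holds the raw HTML string
html = show_weights(booster, feature_names=['f0', 'f1', 'f2', 'f3'],
                    importance_type="gain").data
with open("weights.html", "w") as f:
    f.write(html)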
Example #11
    def train(self, X_train, y_train):
        logger.info("\n{}".format(X_train.dtypes))
        features = self.get_pipe()
        # logger.info(features.get_feature_names())
        # features.fit_transform(X_train)
        pipe = make_pipeline(features, LGBMClassifier())
        logger.info(features.get_params().keys())
        pipe.fit(X_train, y_train)
        scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
        logger.info("Validation Accuracy: {:.3f} ± {:.3f}".format(np.mean(scores), 2 * np.std(scores)))

        clf = pipe.steps[1][1]
        ft = pipe.steps[0][1]
        self.print_importances(clf, X_train)
        logger.info(format_as_text(explain_weights_lightgbm(lgb=clf, vec=ft)))
        return pipe
Example #12
def get_eli5_weights(model: BaseModel, docs: List):
    """ Return eli5 feature weights (as a dict) with added color info.
    """
    logging.info('explaining weights')
    try:
        expl = model.explain_predictions(docs)
    except NotImplementedError:
        expl = model.explain_weights()
    logging.info('model weights:\n{}'.format(
        format_as_text(expl, show=fields.WEIGHTS)))

    if expl.targets:
        weights = expl.targets[0].feature_weights
        weight_range = get_weight_range(weights)
        for w_lst in [weights.pos, weights.neg]:
            w_lst[:] = [{
                'feature': fw.feature,
                'weight': fw.weight,
                'hsl_color': format_hsl(weight_color_hsl(fw.weight, weight_range)),
            } for fw in w_lst]
        weights.neg.reverse()
        return format_as_dict(weights)
    elif expl.feature_importances:
        importances = expl.feature_importances.importances
        weight_range = max_or_0(abs(fw.weight) for fw in importances)
        return {
            'pos': [{
                'feature': fw.feature,
                'weight': float(fw.weight),
                'hsl_color': format_hsl(weight_color_hsl(fw.weight, weight_range)),
            } for fw in importances],
            'neg': [],
            'pos_remaining': int(expl.feature_importances.remaining),
            'neg_remaining': 0,
        }
    else:
        return {}
Example #13
def assert_explain_prediction_single_target(estimator, X, feature_names):
    get_res = lambda _x, **kwargs: explain_prediction(
        estimator, _x, feature_names=feature_names, **kwargs)
    res = get_res(X[0])
    for expl in format_as_all(res, estimator):
        assert_feature_values_present(expl, feature_names, X[0])

    # take first elements in the dataset; check that
    # 1. <BIAS> feature is present;
    # 2. scores have correct absolute values;
    # 3. feature filter function works.
    checked_flt = False
    all_expls = []
    for x in X[:5]:
        res = get_res(x)
        text_expl = format_as_text(res, show=fields.WEIGHTS)
        print(text_expl)
        assert '<BIAS>' in text_expl
        check_targets_scores(res)
        all_expls.append(text_expl)
        checked_flt = checked_flt or _assert_feature_filter_works(get_res, x)

    assert checked_flt
    assert any(f in ''.join(all_expls) for f in feature_names)
Example #14
        from eli5.formatters import format_as_text
        from eli5 import explain_weights
        import jinja2

        path = '/home/felix/SequentialPatternErrorDetection/html/fpredict/model.html'
        url = 'file://' + path
        html = show_weights(final,
                            feature_names=feature_names,
                            importance_type="gain").data

        with open(path, 'w') as webf:
            webf.write(html)
        # webbrowser.open(url)
    except jinja2.exceptions.UndefinedError:
        print(format_as_text(
            explain_weights(final, feature_names=feature_names)))

importances = final.get_score(importance_type='gain')
print(importances)

sorted_x = sorted(importances.items(),
                  key=operator.itemgetter(1),
                  reverse=True)
print(sorted_x)

labels = []
score = []
t = 0
for key, value in sorted_x:
    labels.append(key)
    score.append(value)
            html = "<h1>" + column_name + "</h1>"
            html += "<h2>number of labels: " + str(
                train[column_id].shape[0]) + "</h2>"
            html += "<h2>F-Score: " + str(f1_score(target_run,
                                                   res[column_id])) + "</h2>"
            html += show_weights(final_gb[column_id],
                                 feature_names=feature_name_list,
                                 importance_type="gain").data

            with open(path, 'w') as webf:
                webf.write(html)
            #webbrowser.open(url)
        except jinja2.exceptions.UndefinedError:
            print(format_as_text(
                explain_weights(final_gb[column_id],
                                feature_names=feature_name_list)))

        print "current train shape: " + str(train[column_id].shape)

        print "column: " + str(column_id)
        print_stats(target_run, res[column_id])
        print_stats_whole(dataSet.matrix_is_error[0:split_id, :],
                          all_error_status, "run all")
        if all_matrix_test is not None:
            print_stats_whole(
                dataSet.matrix_is_error[split_id:dataSet.shape[0], :],
                all_error_status_test, "test general")

        number_samples = 0
        for key, value in train.items():
Example #16
def eval_clf(arg,
             text_features,
             ys,
             vec_filename,
             show_features=False,
             n_best_features=None,
             save=None):

    fold_idx, (train_idx, test_idx) = arg
    if fold_idx == 0:
        print('{} in train, {} in test'.format(len(train_idx), len(test_idx)))
    text_pipeline, text_clf = make_text_pipeline()
    text_pipeline.fit(text_features[train_idx], ys[train_idx])
    vec = load_vec(vec_filename)
    if show_features and fold_idx == 0:
        print(format_as_text(explain_weights(text_clf, vec, top=(100, 20))))
    result_metrics = {}
    test_y = ys[test_idx]
    if n_best_features:

        if len(test_idx):
            pred_y = text_pipeline.predict_proba(text_features[test_idx])[:, 1]
            result_metrics.update({
                'PR AUC (all text features)': metrics.average_precision_score(test_y, pred_y),
                'ROC AUC (all text features)': metrics.roc_auc_score(test_y, pred_y),
            })
        coef = sorted(enumerate(text_clf.coef_[0]),
                      key=lambda x: abs(x[1]),
                      reverse=True)
        best_feature_indices = [
            idx for idx, weight in coef[:n_best_features] if weight != 0
        ]
        result_metrics['selected_features'] = len(best_feature_indices)
        text_features = text_features[:, best_feature_indices]
        text_pipeline, text_clf = make_text_pipeline()
        text_pipeline.fit(text_features[train_idx], ys[train_idx])
        inverse = {idx: w for w, idx in vec.vocabulary_.items()}
        vec.vocabulary_ = {
            inverse[idx]: i
            for i, idx in enumerate(best_feature_indices)
        }
        vec.stop_words_ = None
        if show_features and fold_idx == 0:
            print(format_as_text(explain_weights(text_clf, vec,
                                                 top=(100, 20))))

    if len(test_idx):
        text_features_test = text_features[test_idx]
        pred_y = text_pipeline.predict_proba(text_features_test)[:, 1]
        result_metrics.update({
            'PR AUC': metrics.average_precision_score(test_y, pred_y),
            'ROC AUC': metrics.roc_auc_score(test_y, pred_y),
        })
    if save:
        pipeline = Pipeline([
            ('html_to_item', _function_transformer(html_to_item)),
            ('item_to_text', _function_transformer(item_to_text)),
            ('vec', vec),
        ] + text_pipeline.steps)
        Soft404Classifier.save_model(save, pipeline)
    return result_metrics
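Both explain_weights calls in this example pass top=(100, 20), i.e. keep the 100 highest-weighted positive features and the 20 most negative ones rather than a single overall count. A small self-contained illustration (the toy corpus and models are assumptions, not from the source):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from eli5 import explain_weights
from eli5.formatters import format_as_text

docs = ["good fast cheap", "bad slow costly", "great value", "terrible waste"]
labels = [1, 0, 1, 0]
vec = CountVectorizer()
clf = LogisticRegression().fit(vec.fit_transform(docs), labels)

# top may be a single int or a (num_positive, num_negative) tuple
print(format_as_text(explain_weights(clf, vec, top=(100, 20))))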