def eli5visual(pData, pDesc, Idx, pAccountName, pVec, nTopKeywrd, pRootDir):
    """Build and save an ELI5 HTML explanation for each selected row.

    For every id in ``Idx`` that is a valid row position, the row's intent
    (``pData['Intent']``) is used to load a model through ``loadmodel``. The
    model is pipelined behind ``pVec``, a ``TextExplainer`` is fitted on the
    row's ``pDesc`` text, and the rendered HTML is handed to ``savehtml``.

    Args:
        pData: Table-like object supporting ``len()`` and
            ``pData[column][row]`` lookups (presumably a pandas DataFrame
            with the default 0-based index -- TODO confirm against callers).
        pDesc: Name of the column that holds the text to explain.
        Idx: Sequence of row ids to explain.
        pAccountName: Account name forwarded to ``loadmodel``.
        pVec: Vectorizer placed in front of the loaded model.
        nTopKeywrd: Not used by this function; kept for interface
            compatibility.
        pRootDir: Root directory forwarded to ``loadmodel`` and ``savehtml``.

    Returns:
        0 when the loop completes (invalid ids are reported and skipped),
        -1 if any exception was raised while processing.
    """
    try:
        for pId in Idx:
            # Valid row positions are 0 .. len(pData) - 1. The previous check
            # (`<= len(pData)`) let `len(pData)` and negative ids through,
            # which then failed inside the lookup and aborted the whole batch.
            if not 0 <= pId < len(pData):
                print("Please select valid Id")
                continue

            pRow = int(pId)
            pIntent = pData['Intent'][pRow]
            # loadmodel returns a pair; only the second element is used here.
            _, pModels = loadmodel(pRootDir, pAccountName, pIntent)
            pPipeModel = make_pipeline(pVec, pModels)
            pTe = TextExplainer(random_state=42).fit(
                pData[pDesc][pRow], pPipeModel.predict_proba)
            pExplanation = pTe.explain_prediction()
            pHtml = format_as_html(pExplanation,
                                   force_weights=False,
                                   include_styles=False,
                                   horizontal_layout=True,
                                   show_feature_values=False)
            savehtml(pRootDir, pHtml, pId, pIntent)
    except Exception as e:
        print(
            '*** ERROR[003]: Error in visualiation file of eil5visual function: ',
            sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return (-1)
    return (0)
def test_lime_explain_probabilistic(newsgroups_train):
    """TextExplainer should approximate a hashing + naive Bayes pipeline well.

    Trains ``MultinomialNB`` on hashed features of the fixture documents,
    explains the first document and checks the fidelity metrics as well as
    the presence of the word 'file' in the text-formatted explanation.
    """
    docs, y, target_names = newsgroups_train

    try:
        vectorizer = HashingVectorizer(alternate_sign=False)
    except TypeError:
        # sklearn < 0.19 does not know `alternate_sign`
        vectorizer = HashingVectorizer(non_negative=True)

    classifier = MultinomialNB()
    features = vectorizer.fit_transform(docs)
    classifier.fit(features, y)
    print(classifier.score(features, y))

    pipeline = make_pipeline(vectorizer, classifier)
    first_doc = docs[0]

    explainer = TextExplainer(random_state=42)
    explainer.fit(first_doc, pipeline.predict_proba)

    print(explainer.metrics_)
    assert explainer.metrics_['score'] > 0.7
    assert explainer.metrics_['mean_KL_divergence'] < 0.1

    explanation = explainer.explain_prediction(top=20, target_names=target_names)
    text_report = format_as_text(explanation)
    print(text_report)
    assert 'file' in text_report
def _lime_analyze(self, query, indicies, max_len, max_replace, top_targets=None):
    """Produce one LIME explanation of ``query`` per entry of ``indicies``.

    The query is preprocessed once; for every index a predict function is
    obtained from an ``ExplainerGenerator`` and a position-dependent
    ``TextExplainer`` (sharing one ``MaskingTextSampler``) is fitted on the
    space-joined tokens.

    Returns:
        A list of explanation objects, in the order of ``indicies``.
    """
    model = self.model
    word_index = self.vocab.word_to_idx
    label_index = self.label.word_to_idx
    tokens = self.preprocess(query)

    generator = ExplainerGenerator(model, word_index, max_len)
    masking_sampler = MaskingTextSampler(
        replacement=UNK,
        max_replace=max_replace,
        token_pattern=None,
        bow=False,
    )

    def explain_for(index):
        # A fresh explainer per index; only the sampler is shared.
        predict_fn = generator.get_predict_function(index)
        explainer = TextExplainer(
            sampler=masking_sampler,
            position_dependent=True,
            random_state=RANDOM_SEED,
        )
        explainer.fit(' '.join(tokens), predict_fn)
        # The first three label entries are skipped
        # (presumably reserved/special labels -- TODO confirm).
        return explainer.explain_prediction(
            target_names=list(label_index)[3:],
            top_targets=top_targets,
        )

    return [explain_for(index) for index in indicies]
def test_text_explainer_position_dependent():
    """Compare bag-of-words, position-dependent and bigram explainers.

    The black-box function only reacts to 'bar' as the second token, so a
    plain bag-of-words explainer must fit poorly while a position-dependent
    one (or one using bigrams) must fit well. Combining
    ``position_dependent=True`` with a custom vectorizer must raise.
    """
    text = "foo bar baz egg spam bar baz egg spam ham"

    @_apply_to_list
    def predict_proba(doc):
        words = doc.split()
        # 'bar' matters only near the start of the document, not at the end
        if len(words) >= 2 and words[1] == 'bar':
            return [0, 1]
        return [1, 0]

    # a bag-of-words model cannot express the positional rule above
    bow_explainer = TextExplainer(random_state=42, vec=CountVectorizer())
    bow_explainer.fit(text, predict_proba)
    print(bow_explainer.metrics_)
    assert bow_explainer.metrics_['score'] < 0.9
    assert bow_explainer.metrics_['mean_KL_divergence'] > 0.3

    # position_dependent=True makes the rule learnable
    positional_explainer = TextExplainer(position_dependent=True, random_state=42)
    positional_explainer.fit(text, predict_proba)
    print(positional_explainer.metrics_)
    assert positional_explainer.metrics_['score'] > 0.95
    assert positional_explainer.metrics_['mean_KL_divergence'] < 0.3
    positional_expl = positional_explainer.explain_prediction()
    format_as_all(positional_expl, positional_explainer.clf_)

    # a custom vectorizer with bigrams gets almost as far
    bigram_vec = CountVectorizer(ngram_range=(1, 2))
    bigram_explainer = TextExplainer(vec=bigram_vec, random_state=42)
    bigram_explainer.fit(text, predict_proba)
    print(bigram_explainer.metrics_)
    assert bigram_explainer.metrics_['score'] > 0.95
    assert bigram_explainer.metrics_['mean_KL_divergence'] < 0.3
    bigram_expl = bigram_explainer.explain_prediction()
    format_as_all(bigram_expl, bigram_explainer.clf_)

    # position_dependent=True does not accept a custom vectorizer
    with pytest.raises(ValueError):
        TextExplainer(position_dependent=True, vec=HashingVectorizer())
def test_text_explainer_token_pattern():
    """A custom ``token_pattern`` that keeps hyphens yields 'foo-bar' as top feature."""
    text = "foo-bar baz egg-spam"
    predict_proba = substring_presence_predict_proba('bar')

    # tokens may contain hyphens with this pattern
    hyphen_pattern = r'(?u)\b[-\w]+\b'
    explainer = TextExplainer(token_pattern=hyphen_pattern)
    explainer.fit(text, predict_proba)

    print(explainer.metrics_)
    assert explainer.metrics_['score'] > 0.95
    assert explainer.metrics_['mean_KL_divergence'] < 0.1

    explanation = explainer.explain_prediction()
    format_as_all(explanation, explainer.clf_)

    top_positive = explanation.targets[0].feature_weights.pos[0]
    assert top_positive.feature == 'foo-bar'
def _feature_weights_to_dict(feature_weights):
    """Turn eli5 feature weights into ``{'pos': [[name, weight], ...], 'neg': [...]}``."""
    return {
        'pos': [[fw.feature, fw.weight] for fw in feature_weights.pos],
        'neg': [[fw.feature, fw.weight] for fw in feature_weights.neg],
    }


def predict(model_id):
    """Predict on the JSON request body and explain every prediction.

    The classifier is taken from the in-memory ``clfs`` cache when present,
    otherwise loaded with ``joblib`` from ``model_file_name(model_id)``.
    Text models are wrapped in a pipeline with ``vectorizer`` and explained
    per post with ``TextExplainer``; other models get a one-hot encoded
    frame aligned to ``model_columns`` and are explained per row with
    ``eli5.explain_prediction``.

    Args:
        model_id: Identifier of the model; must be convertible with ``int()``.

    Returns:
        A JSON response with ``predictions`` (stringified) and
        ``explanations`` (one ``{'pos': ..., 'neg': ...}`` dict per input),
        a JSON response with ``error`` and ``trace`` if anything fails, or
        the plain string ``'no model here'`` when no model file exists.
    """
    if not os.path.exists("model/" + str(int(model_id)) + ".pkl"):
        print('train first')
        return 'no model here'

    try:
        model_key = str(model_id)
        if model_key in clfs:
            clf = clfs[model_key]
        else:
            clf = joblib.load(model_file_name(model_id))

        explainers = []
        if is_text_type(model_id):
            pipe = make_pipeline(vectorizer, clf)
            # NOTE(review): the whole JSON payload is passed to predict while
            # the explainer below uses post['text'] -- confirm that
            # `vectorizer` expects the raw post objects.
            prediction = pipe.predict(request.json)
            for post in request.json:
                te = TextExplainer(random_state=42, n_samples=500)
                te.fit(post['text'], pipe.predict_proba)
                made = te.explain_prediction(target_names=['pos', 'neg'])
                explainers.append(
                    _feature_weights_to_dict(made.targets[0].feature_weights))
        else:
            rows = request.json
            query = pd.get_dummies(pd.DataFrame(rows))
            # Align to the training columns; unseen dummies become 0.
            query = query.reindex(columns=model_columns, fill_value=0)
            prediction = clf.predict(query)
            for _, row in query.iterrows():
                explanation = eli5.explain_prediction(clf, row)
                explainers.append(
                    _feature_weights_to_dict(
                        explanation.targets[0].feature_weights))

        # Predictions are stringified before serialization
        # (presumably because numpy scalars are not JSON-serializable).
        return jsonify({
            "predictions": list(map(str, prediction)),
            "explanations": explainers
        })
    except Exception as e:
        return jsonify({'error': str(e), 'trace': traceback.format_exc()})
def test_text_explainer_char_based(token_pattern):
    """Character-based explanations should single out the substring 'lo'."""
    text = "Hello, world!"
    predict_proba = substring_presence_predict_proba('lo')

    explainer = TextExplainer(char_based=True, token_pattern=token_pattern)
    explainer.fit(text, predict_proba)

    print(explainer.metrics_)
    assert explainer.metrics_['score'] > 0.95
    assert explainer.metrics_['mean_KL_divergence'] < 0.1

    prediction_expl = explainer.explain_prediction()
    format_as_all(prediction_expl, explainer.clf_)
    check_targets_scores(prediction_expl)
    top_feature = prediction_expl.targets[0].feature_weights.pos[0]
    assert top_feature.feature == 'lo'

    # the weights view is an alternative (less useful for char ngrams)
    weights_expl = explainer.explain_weights()
    top_weight = weights_expl.targets[0].feature_weights.pos[0]
    assert top_weight.feature == 'lo'
def test_text_explainer_custom_classifier():
    """A decision tree can be used as the white-box model of TextExplainer."""
    text = "foo-bar baz egg-spam"
    predict_proba = substring_presence_predict_proba('bar')

    # explain the prediction with a shallow decision tree
    tree_clf = DecisionTreeClassifier(max_depth=2)
    explainer = TextExplainer(clf=tree_clf)
    explainer.fit(text, predict_proba)

    print(explainer.metrics_)
    assert explainer.metrics_['score'] > 0.99
    assert explainer.metrics_['mean_KL_divergence'] < 0.01

    prediction_expl = explainer.explain_prediction()
    format_as_all(prediction_expl, explainer.clf_)

    # explain_weights exposes the fitted tree itself
    weights_expl = explainer.explain_weights()
    root = weights_expl.decision_tree.tree
    print(root)
    assert root.feature_name == "bar"
    format_as_all(weights_expl, explainer.clf_)
def test_lime_flat_neighbourhood(newsgroups_train):
    """With ``expand_factor=None`` the explanation must still mention 'file'.

    The black-box function returns one of two fixed probability vectors
    depending on whether 'file' occurs in the document.
    """
    docs, y, target_names = newsgroups_train
    first_doc = docs[0]

    @_apply_to_list
    def predict_proba(doc):
        """Return fixed probabilities; only three labels ever get non-zero mass."""
        if 'file' in doc:
            return [0, 1.0, 0, 0]
        return [0.9, 0, 0.1, 0]

    explainer = TextExplainer(expand_factor=None, random_state=42)
    explainer.fit(first_doc, predict_proba)
    print(explainer.metrics_)
    print(explainer.clf_.classes_, target_names)

    explanation = explainer.explain_prediction(top=20, target_names=target_names)
    for rendered in format_as_all(explanation, explainer.clf_):
        assert 'file' in rendered
        assert "comp.graphics" in rendered
def st_lime_explanation(
    text: str,
    predict_func: Callable[[List[str]], np.ndarray],
    unique_labels: List[str],
    n_samples: int,
    position_dependent: bool = True,
):
    """Render a LIME explanation of ``text`` in the Streamlit app.

    Fits an ELI5 ``TextExplainer`` on ``text`` using ``predict_func``, shows
    the explainer's fidelity metrics as JSON and then, for each explained
    target in order of decreasing probability, a sub-header and a dataframe
    of features sorted by absolute contribution (``weight * value``).

    Args:
        text: Document to explain.
        predict_func: Black-box function passed to ``TextExplainer.fit``.
        unique_labels: Display labels, indexed by the explainer's target id.
        n_samples: Number of samples the explainer generates.
        position_dependent: Forwarded to ``TextExplainer``.
    """
    # TODO just use ELI5's built-in visualization when streamlit supports it:
    # https://github.com/streamlit/streamlit/issues/779
    with st.spinner("Generating LIME explanations..."):
        te = TextExplainer(
            random_state=1, n_samples=n_samples, position_dependent=position_dependent
        )
        te.fit(text, predict_func)

    st.json(te.metrics_)
    explanation = te.explain_prediction()
    explanation_df = eli5.format_as_dataframe(explanation)

    for target in sorted(explanation.targets, key=lambda t: -t.proba):
        # Use the target's own id, not its rank in the sorted order: the rank
        # would pair this target's probability/score with the label and
        # feature rows of a different class.
        target_ndx = target.target

        target_explanation_df = explanation_df[
            explanation_df["target"] == target_ndx
        ].copy()
        target_explanation_df["contribution"] = (
            target_explanation_df["weight"] * target_explanation_df["value"]
        )
        target_explanation_df["abs_contribution"] = abs(
            target_explanation_df["contribution"]
        )
        target_explanation_df = (
            target_explanation_df.drop("target", axis=1)
            .sort_values(by="abs_contribution", ascending=False)
            .reset_index(drop=True)
        )

        st.subheader(
            f"Target: {unique_labels[target_ndx]} (probability {target.proba:.4f}, score {target.score:.4f})"
        )
        st.dataframe(target_explanation_df)
# Explain every opcode dump of `bin_name` with a TextExplainer and write one
# HTML report per input file into explanation/<bin_name>_html/.
# opcodes_dir = '/home/hwangdz/coreutils/coreutils-8.28/install_m32/bin/md5funcs_ops'
opcodes_dir = '/home/hwangdz/git/rl-select-div/only-similarity/explanation/%s_ops_info' % bin_name
output_dir = 'explanation/%s_html' % bin_name
if not os.path.isdir(output_dir):
    # makedirs also creates the parent 'explanation' directory; plain mkdir
    # failed when it did not exist yet.
    os.makedirs(output_dir)

for file_name in os.listdir(opcodes_dir):
    # 'op_distribution' is skipped (presumably a summary, not an opcode dump).
    if file_name == 'op_distribution':
        continue

    file_path = os.path.join(opcodes_dir, file_name)
    with open(file_path, 'r') as f:
        op_codes = f.read()
    # Skip files with fewer than 20 characters of content.
    if len(op_codes) < 20:
        continue

    num_ops = len(op_codes.split())
    op_codes = op_codes.replace('\n', ' ')

    opcode_explainer = TextExplainer(random_state=59, sampler=ops_sampler, n_samples=5000)
    # repeat_times = (len(op_codes.split()) / 100) ** 2
    repeat_times = 1
    for _ in range(repeat_times):
        opcode_explainer.fit(op_codes, ss.predict_proba)
    explanation = opcode_explainer.explain_prediction()._repr_html_()

    # Build the report path from output_dir so it always matches the
    # directory created above.
    report_path = os.path.join(output_dir, 'explanation-%s.html' % file_name)
    with open(report_path, 'w') as ef:
        ef.write(explanation)
        ef.write('num of opcodes: %d\n' % num_ops)
        ef.write('</br>\n')
        ef.write(op_codes)