Example #1
    def explain(self, doc, truncate_len=512, all_targets=False):
        """
        Highlights text to explain prediction
        Args:
            doc (str): text of document
            truncate_len(int): truncate document to this many words
            all_targets(bool):  If True, show visualization for
                                each target.
        """
        try:
            import eli5
            from eli5.lime import TextExplainer
        except ImportError:
            msg = 'ktrain requires a forked version of eli5 to support tf.keras. '+\
                  'Install with: pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1'
            warnings.warn(msg)
            return

        prediction = [self.predict(doc)] if not all_targets else None

        if not isinstance(doc, str): raise TypeError('text must be of type str')
        if self.preproc.is_nospace_lang():
            doc = self.preproc.process_chinese([doc])
            doc = doc[0]
        doc = ' '.join(doc.split()[:truncate_len])
        te = TextExplainer(random_state=42)
        _ = te.fit(doc, self.predict_proba)
        return te.show_prediction(target_names=self.preproc.get_classes(),
                                  targets=prediction)
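The explain() method above wraps eli5's TextExplainer: it fits a local white-box model around one document and renders the highlighted prediction. A minimal, self-contained sketch of that underlying pattern (the toy corpus, labels, and pipeline below are illustrative, not taken from ktrain):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from eli5.lime import TextExplainer

# Toy sentiment corpus; any fitted pipeline exposing predict_proba works here.
texts = ["the film was wonderful", "terrible plot and acting",
         "a wonderful, moving story", "boring and terrible"]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative
pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipe.fit(texts, labels)

te = TextExplainer(random_state=42)
te.fit("a wonderful film", pipe.predict_proba)   # sample and learn around one document
te.show_prediction(target_names=["negative", "positive"])  # renders in a Jupyter notebook
print(te.metrics_)  # quality of the local surrogate: score and mean_KL_divergence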
Example #2
    def explain(self,
                doc,
                truncate_len=512,
                all_targets=False,
                n_samples=2500):
        """
        Highlights text to explain prediction
        Args:
            doc (str): text of document
            truncate_len(int): truncate document to this many words
            all_targets(bool):  If True, show visualization for
                                each target.
            n_samples(int): number of samples to generate and train on.
                            Larger values give better results, but will take more time.
                            Lower this value if explain is taking too long.
        """
        is_array, is_pair = detect_text_format(doc)
        if is_pair:
            warnings.warn(
                "currently_unsupported: explain does not currently support sentence pair classification"
            )
            return
        if not self.c:
            warnings.warn(
                "currently_unsupported: explain does not support text regression"
            )
            return
        try:
            import eli5
            from eli5.lime import TextExplainer
        except ImportError:
            msg = (
                "ktrain requires a forked version of eli5 to support tf.keras. "
                +
                "Install with: pip install https://github.com/amaiya/eli5/archive/refs/heads/tfkeras_0_10_1.zip"
            )
            warnings.warn(msg)
            return
        if (not hasattr(eli5, "KTRAIN_ELI5_TAG")
                or eli5.KTRAIN_ELI5_TAG != KTRAIN_ELI5_TAG):
            msg = (
                "ktrain requires a forked version of eli5 to support tf.keras. It is either missing or not up-to-date. "
                +
                "Uninstall the current version and install/re-install the fork with: pip install https://github.com/amaiya/eli5/archive/refs/heads/tfkeras_0_10_1.zip"
            )
            warnings.warn(msg)
            return

        if not isinstance(doc, str):
            raise TypeError("text must be of type str")
        prediction = [self.predict(doc)] if not all_targets else None

        if self.preproc.is_nospace_lang():
            doc = self.preproc.process_chinese([doc])
            doc = doc[0]
        doc = " ".join(doc.split()[:truncate_len])
        te = TextExplainer(random_state=42, n_samples=n_samples)
        _ = te.fit(doc, self.predict_proba)
        return te.show_prediction(target_names=self.preproc.get_classes(),
                                  targets=prediction)
Example #3
def eli5visual(pData, pDesc, Idx, pAccountName, pVec, nTopKeywrd, pRootDir):
    try:
        for i in range(len(Idx)):
            if Idx[i] <= len(pData):
                pIntent = pData['Intent'][int(Idx[i])]
                _, pModels = loadmodel(pRootDir, pAccountName, pIntent)
                pPipeModel = make_pipeline(pVec, pModels)
                pTe = TextExplainer(random_state=42).fit(
                    pData[pDesc][int(Idx[i])], pPipeModel.predict_proba)
                pExplanation = pTe.explain_prediction()
                pHtml = format_as_html(pExplanation,
                                       force_weights=False,
                                       include_styles=False,
                                       horizontal_layout=True,
                                       show_feature_values=False)
                savehtml(pRootDir, pHtml, Idx[i], pIntent)
            else:
                print("Please select valid Id")

    except Exception as e:
        print(
            '*** ERROR[003]: Error in eli5visual function of visualization file: ',
            sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return (-1)
    return (0)
Example #4
def main():
    df = pd.read_excel('data/mr_vs_fr_30.xlsx')
    df = df.sample(frac=1, random_state=seed)

    df['text_lemmatized'] = df['text'].apply(morphText)

    X_train, X_test, y_train, y_test = train_test_split(
        df['text_lemmatized'], df['label'], test_size=0.3, random_state=42, stratify=df['label'])

    flag_test = True
    get_pipe(X_train, y_train, flag_test, X_test, y_test)

    flag_test = False
    pipe = get_pipe(df['text_lemmatized'], df['label'], flag_test)

    k = 0
    words = []
    for index, row in df.iterrows():
        te5 = TextExplainer(clf=DecisionTreeClassifier(max_depth=5), random_state=seed)
        te5.fit(row['text_lemmatized'], pipe.predict_proba)
        df_eli5_w = eli5.format_as_dataframe(te5.explain_weights())
        print('class {}'.format('male' if row['label'] == 0 else 'female'))
        print('predict:')
        print(df_eli5_w)
        print(100*'*')
        temp_m = ', '.join(df_eli5_w[df_eli5_w['weight'] > 0]['feature'].tolist())
        if temp_m:
            words.append(temp_m)
        else:
            words.append('')
        k += 1

    df['words'] = words
    df.to_excel('mr_vs_fr_words_30.xlsx', index=False)
Example #5
def test_lime_explain_probabilistic(newsgroups_train):
    docs, y, target_names = newsgroups_train
    try:
        vec = HashingVectorizer(alternate_sign=False)
    except TypeError:
        # sklearn < 0.19
        vec = HashingVectorizer(non_negative=True)
    clf = MultinomialNB()

    X = vec.fit_transform(docs)
    clf.fit(X, y)
    print(clf.score(X, y))

    pipe = make_pipeline(vec, clf)
    doc = docs[0]

    te = TextExplainer(random_state=42)
    te.fit(doc, pipe.predict_proba)

    print(te.metrics_)
    assert te.metrics_['score'] > 0.7
    assert te.metrics_['mean_KL_divergence'] < 0.1

    res = te.explain_prediction(top=20, target_names=target_names)
    expl = format_as_text(res)
    print(expl)
    assert 'file' in expl
Example #6
    def _lime_analyze(self,
                      query,
                      indices,
                      max_len,
                      max_replace,
                      top_targets=None):
        model = self.model
        vocab = self.vocab.word_to_idx
        label = self.label.word_to_idx
        prepro_query = self.preprocess(query)

        explainer_generator = ExplainerGenerator(model, vocab, max_len)

        sampler = MaskingTextSampler(replacement=UNK,
                                     max_replace=max_replace,
                                     token_pattern=None,
                                     bow=False)

        explainer_list = list()
        for i in indices:
            predict_fn = explainer_generator.get_predict_function(i)

            te = TextExplainer(
                sampler=sampler,
                position_dependent=True,
                random_state=RANDOM_SEED,
            )

            te.fit(' '.join(prepro_query), predict_fn)

            pred_explain = te.explain_prediction(
                target_names=[l for l in label][3:], top_targets=top_targets)
            explainer_list.append(pred_explain)

        return explainer_list
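Example #6 plugs a custom MaskingTextSampler into TextExplainer so that sampled tokens are replaced with an UNK symbol rather than dropped. A minimal, self-contained sketch of that wiring (the toy predict function, replacement token, and class names are illustrative, not from the snippet above):

import numpy as np
import eli5
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler

# Toy black-box: class 1 whenever the token 'bar' survives the masking.
def predict_proba(docs):
    return np.array([[0.0, 1.0] if 'bar' in doc else [1.0, 0.0] for doc in docs])

sampler = MaskingTextSampler(replacement='<UNK>', max_replace=0.5,
                             token_pattern=None, bow=False)
te = TextExplainer(sampler=sampler, position_dependent=True, random_state=42)
te.fit('foo bar baz egg spam', predict_proba)
print(eli5.format_as_text(te.explain_prediction(target_names=['other', 'has_bar'])))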
Example #7
def test_text_explainer_rbf_sigma():
    text = 'foo bar baz egg spam'
    predict_proba = substring_presence_predict_proba('bar')

    te1 = TextExplainer().fit(text, predict_proba)
    te2 = TextExplainer(rbf_sigma=0.1).fit(text, predict_proba)
    te3 = TextExplainer(rbf_sigma=1.0).fit(text, predict_proba)

    assert te1.similarity_.sum() < te3.similarity_.sum()
    assert te1.similarity_.sum() > te2.similarity_.sum()
Example #8
def test_text_explainer_token_pattern():
    text = "foo-bar baz egg-spam"
    predict_proba = substring_presence_predict_proba('bar')

    # a different token_pattern
    te = TextExplainer(token_pattern=r'(?u)\b[-\w]+\b')
    te.fit(text, predict_proba)
    print(te.metrics_)
    assert te.metrics_['score'] > 0.95
    assert te.metrics_['mean_KL_divergence'] < 0.1
    expl = te.explain_prediction()
    format_as_all(expl, te.clf_)

    assert expl.targets[0].feature_weights.pos[0].feature == 'foo-bar'
Example #9
def predict(model_id):
    if os.path.exists("model/" + str(int(model_id)) + ".pkl"):
        try:
            if str(model_id) in clfs:
                clf = clfs[str(model_id)]
            else:
                clf = joblib.load(model_file_name(model_id))
            explainers = []
            if is_text_type(model_id):
                pipe = make_pipeline(vectorizer, clf)
                prediction = pipe.predict(request.json)

                for post in request.json:
                    te = TextExplainer(random_state=42, n_samples=500)
                    te.fit(post['text'], pipe.predict_proba)
                    made = te.explain_prediction(target_names=['pos', 'neg'])
                    explanation = made.targets[0].feature_weights
                    op_exp = {'pos': [], 'neg': []}
                    for feature in explanation.pos:
                        op_exp['pos'].append([feature.feature, feature.weight])
                    for feature in explanation.neg:
                        op_exp['neg'].append([feature.feature, feature.weight])
                    explainers.append(op_exp)
            else:
                rows = request.json
                query = pd.get_dummies(pd.DataFrame(rows))
                query = query.reindex(columns=model_columns, fill_value=0)
                prediction = clf.predict(query)
                for index, row in query.iterrows():
                    explanation = eli5.explain_prediction(
                        clf, row).targets[0].feature_weights
                    op_exp = {'pos': [], 'neg': []}
                    for feature in explanation.pos:
                        op_exp['pos'].append([feature.feature, feature.weight])
                    for feature in explanation.neg:
                        op_exp['neg'].append([feature.feature, feature.weight])
                    explainers.append(op_exp)

            # Convert predictions to strings for JSON serialization
            return jsonify({
                "predictions": list(map(str, prediction)),
                "explanations": explainers
            })

        except Exception as e:

            return jsonify({'error': str(e), 'trace': traceback.format_exc()})
    else:
        print('train first')
        return 'no model here'
Example #10
def highlight_text(text):
    predict_dict = predict(text)

    try:
        te = TextExplainer(random_state=42, n_samples=1000)
        te.fit(text, nn_model.predict_proba)
        highlight_html = te.show_prediction(
            target_names=[val for val in CLASSES.values()],
            top_targets=3,
            top=200)
        predict_dict["highlight"] = highlight_html
    except Exception:
        predict_dict["highlight"] = None

    return predict_dict
Example #11
    def explain(self,
                doc,
                truncate_len=512,
                all_targets=False,
                n_samples=2500):
        """
        Highlights text to explain prediction
        Args:
            doc (str): text of document
            truncate_len(int): truncate document to this many words
            all_targets(bool):  If True, show visualization for
                                each target.
            n_samples(int): number of samples to generate and train on.
                            Larger values give better results, but will take more time.
                            Lower this value if explain is taking too long.
        """
        is_array, is_pair = detect_text_format(doc)
        if is_pair:
            warnings.warn(
                'currently_unsupported: explain does not currently support sentence pair classification'
            )
            return
        if not self.c:
            warnings.warn(
                'currently_unsupported: explain does not support text regression'
            )
            return
        try:
            import eli5
            from eli5.lime import TextExplainer
        except ImportError:
            msg = 'ktrain requires a forked version of eli5 to support tf.keras. '+\
                  'Install with: pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1'
            warnings.warn(msg)
            return

        prediction = [self.predict(doc)] if not all_targets else None

        if not isinstance(doc, str): raise TypeError('text must be of type str')
        if self.preproc.is_nospace_lang():
            doc = self.preproc.process_chinese([doc])
            doc = doc[0]
        doc = ' '.join(doc.split()[:truncate_len])
        te = TextExplainer(random_state=42, n_samples=n_samples)
        _ = te.fit(doc, self.predict_proba)
        return te.show_prediction(target_names=self.preproc.get_classes(),
                                  targets=prediction)
Example #12
def limeTextClassification(
    dataset, data, pr=Predictor(callingFunction="TextClassifier")
):  # example retrieved from https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html#textexplainer

    pr.dataset = dataset
    resultColumnName = pr.resultColumn
    dataClasses = list(dict.fromkeys(data[resultColumnName].astype(str)))
    dataClasses.sort()

    te = TextExplainer(random_state=42)
Example #13
def test_lime_flat_neighbourhood(newsgroups_train):
    docs, y, target_names = newsgroups_train
    doc = docs[0]

    @_apply_to_list
    def predict_proba(doc):
        """ This function predicts non-zero probabilities only for 3 labels """
        proba_graphics = [0, 1.0, 0, 0]
        proba_other = [0.9, 0, 0.1, 0]
        return proba_graphics if 'file' in doc else proba_other

    te = TextExplainer(expand_factor=None, random_state=42)
    te.fit(doc, predict_proba)
    print(te.metrics_)
    print(te.clf_.classes_, target_names)

    res = te.explain_prediction(top=20, target_names=target_names)
    for expl in format_as_all(res, te.clf_):
        assert 'file' in expl
        assert "comp.graphics" in expl
Example #14
def st_lime_explanation(
    text: str,
    predict_func: Callable[[List[str]], np.ndarray],
    unique_labels: List[str],
    n_samples: int,
    position_dependent: bool = True,
):
    # TODO just use ELI5's built-in visualization when streamlit supports it:
    # https://github.com/streamlit/streamlit/issues/779
    with st.spinner("Generating LIME explanations..."):
        te = TextExplainer(
            random_state=1, n_samples=n_samples, position_dependent=position_dependent
        )
        te.fit(text, predict_func)
    st.json(te.metrics_)
    explanation = te.explain_prediction()
    explanation_df = eli5.format_as_dataframe(explanation)
    for target_ndx, target in enumerate(
        sorted(explanation.targets, key=lambda t: -t.proba)
    ):
        target_explanation_df = explanation_df[
            explanation_df["target"] == target_ndx
        ].copy()

        target_explanation_df["contribution"] = (
            target_explanation_df["weight"] * target_explanation_df["value"]
        )
        target_explanation_df["abs_contribution"] = abs(
            target_explanation_df["contribution"]
        )
        target_explanation_df = (
            target_explanation_df.drop("target", axis=1)
            .sort_values(by="abs_contribution", ascending=False)
            .reset_index(drop=True)
        )
        st.subheader(
            f"Target: {unique_labels[target_ndx]} (probability {target.proba:.4f}, score {target.score:.4f})"
        )
        st.dataframe(target_explanation_df)
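Example #14 converts the explanation into a DataFrame with eli5.format_as_dataframe and derives a contribution column from weight and value. A self-contained sketch of that tabular pattern (the toy pipeline and text are illustrative and not part of st_lime_explanation):

import eli5
from eli5.lime import TextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipe.fit(["good service", "bad service", "good food", "bad food"], [1, 0, 1, 0])

te = TextExplainer(random_state=1, n_samples=1000, position_dependent=True)
te.fit("good food, bad service", pipe.predict_proba)

explanation_df = eli5.format_as_dataframe(te.explain_prediction())
# One row per (target, feature) with its weight; Example #14 multiplies the
# weight and value columns to rank features by absolute contribution.
print(explanation_df.head())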
Example #15
def limeTextClassification(
    dataset, data, pr=Predictor(callingFunction="TextClassifier")
):  # example retrieved from https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html#textexplainer

    pr = Predictor(dataset=dataset, callingFunction="TextClassifier")
    resultColumnName = pr.resultColumn
    dataClasses = list(dict.fromkeys(data[resultColumnName].astype(str)))
    dataClasses.sort()

    te = TextExplainer(random_state=42)
    te.fit(dataset["text"], pr.predict_proba)

    te.show_prediction(target_names=pr._classes_000.tolist())

    return te, pr._classes_000.tolist()
Example #16
def test_text_explainer_custom_classifier():
    text = "foo-bar baz egg-spam"
    predict_proba = substring_presence_predict_proba('bar')

    # use decision tree to explain the prediction
    te = TextExplainer(clf=DecisionTreeClassifier(max_depth=2))
    te.fit(text, predict_proba)
    print(te.metrics_)
    assert te.metrics_['score'] > 0.99
    assert te.metrics_['mean_KL_divergence'] < 0.01
    expl = te.explain_prediction()
    format_as_all(expl, te.clf_)

    # with explain_weights we can get a nice tree representation
    expl = te.explain_weights()
    print(expl.decision_tree.tree)
    assert expl.decision_tree.tree.feature_name == "bar"
    format_as_all(expl, te.clf_)
Example #17
def test_text_explainer_char_based(token_pattern):
    text = "Hello, world!"
    predict_proba = substring_presence_predict_proba('lo')

    te = TextExplainer(char_based=True, token_pattern=token_pattern)
    te.fit(text, predict_proba)
    print(te.metrics_)
    assert te.metrics_['score'] > 0.95
    assert te.metrics_['mean_KL_divergence'] < 0.1

    res = te.explain_prediction()
    format_as_all(res, te.clf_)
    check_targets_scores(res)
    assert res.targets[0].feature_weights.pos[0].feature == 'lo'

    # another way to look at results (not that useful for char ngrams)
    res = te.explain_weights()
    assert res.targets[0].feature_weights.pos[0].feature == 'lo'
Example #18
def test_text_explainer_show_methods():
    pytest.importorskip('IPython')
    from IPython.display import HTML

    text = "Hello, world!"

    @_apply_to_list
    def predict_proba(doc):
        return [0.0, 1.0] if 'lo' in doc else [1.0, 0.0]

    te = TextExplainer()
    te.fit(text, predict_proba)

    pred_expl = te.show_prediction()
    assert isinstance(pred_expl, HTML)
    assert 'lo' in pred_expl.data

    weight_expl = te.show_weights()
    assert isinstance(weight_expl, HTML)
    assert 'lo' in weight_expl.data
Example #19
train = fetch_subset('train')
test = fetch_subset('test')

vec = TfidfVectorizer(min_df=3, stop_words='english', ngram_range=(1, 2))
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
lsa = make_pipeline(vec, svd)

clf = SVC(C=150, gamma=2e-2, probability=True)
pipe = make_pipeline(lsa, clf)
pipe.fit(train.data, train.target)
pipe.score(test.data, test.target)

doc = test.data[0]
print_prediction(doc)

te = TextExplainer(random_state=42)
te.fit(doc, pipe.predict_proba)
#print(te.explain_prediction(target_names=twenty_train.target_names))
#print(eli5.format_as_image(te.explain_weights(target_names=twenty_train.target_names)))

show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(
    format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

weights = eli5.show_weights(clf,
                            vec=vec,
                            target_names=train['target_names'],
                            horizontal_layout=False)

pred = show_html_expl(explain_prediction(clf,
Example #20
__author__ = 'xead'
# coding: utf-8

from sentiment_classifier import SentimentClassifier
from sklearn.externals import joblib
from eli5.lime import TextExplainer

#clf = SentimentClassifier()

#pred = clf.get_prediction_message("Хороший телефон")
text = 'Хороший был у меня телефон 5 лет назад'

pipe = joblib.load("./pipe6.pkl")
te = TextExplainer(random_state=42)
te.fit(text, pipe.predict_proba)
res = te.show_prediction(target_names=['negative', 'positive'], top=25)

print (res)
Example #21
# + Data
# + Model
# + Target Names
# + Function

# In[499]:

from eli5.lime import TextExplainer

# In[500]:

pipe.predict_proba

# In[501]:

exp = TextExplainer(random_state=42)

# In[502]:

X_test.values[0]

# In[515]:

a = pipelog.predict([input()])
if a == 1:
    print("hate statement")
elif a == 0:
    print("Not hate bro!")

# In[374]:
Example #22
# -*- coding: utf-8 -*-
"""
__title__ = 'eli5'
__author__ = 'JieYuan'
__mtime__ = '2018/8/21'
"""
import eli5
from eli5.lime import TextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

X = [
    "The dimension of the input documents is reduced to 100, and then a kernel SVM is used to classify the documents.",
    "This is what the pipeline returns for a document - it is pretty sure the first message in test data belongs to sci.med:"
]

y = [0, 1]

pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipe.fit(X, y)  # fit the pipeline before explaining its predictions

te = TextExplainer(random_state=42)
te.fit(X[0], pipe.predict_proba)
te.show_prediction()
te.show_weights()

eli5.show_prediction
Example #23
            for word in words:
                index = 0
                for word_block in word:
                    if len(average_word_vector) == index:
                        average_word_vector.append(0)
                    average_word_vector[index] += float(word_block)
                    index += 1
            index = 0
            for word_block in average_word_vector:
                average_word_vector[index] /= float(len(words))
                index += 1
            xout.append(average_word_vector)
        return np.array(xout)


vectorizer = V()

for classifier in classifiers:
    print(classifier)
    gnb = classifier()

    pipe = make_pipeline(vectorizer, gnb)
    pipe.fit(x[:testcutoff], y[:testcutoff])

    y_predicted = pipe.predict_proba(x[testcutoff:])
    #print(classification_report(y[testcutoff:], y_predicted, target_names=['known weird', 'less weird']))

    te = TextExplainer(random_state=101, n_samples=500)
    te.fit('Green new deal is the best bro, bring it on', pipe.predict_proba)
    te.show_prediction(target_names=['known weird', 'less weird'])
Example #24
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.1, random_state = 40, stratify=Y)



text_model.fit(x_train, y_train)



text_model.score(x_test, y_test)



from IPython.display import display, HTML





import eli5
from eli5.lime import TextExplainer

for idx in x_test.index[190:210]:
  te = TextExplainer(random_state=42)
  te.fit(cleaner(x_test[idx]), text_model.predict_proba, )
  print("Real Class:",  ["Non Toxic" if x == 0 else "Toxic" for x in [df_corpus_final.iloc[idx]['class']]])
  print("Text uncleaned tweet:", df_corpus_final.iloc[idx]['tweet'])
  print("ELI5 Predicted Class:")
  display(te.show_prediction(target_names=['Non Toxic', 'Toxic']))
  
import pickle
pickle.dump(text_model, open('toxic.pickle', 'wb'))  # save the trained model once, outside the loop
Example #25
    def get_result(self, text):
        te = TextExplainer(random_state=42)
        te.fit(text, self.pipe.predict_proba)
        res = te.show_prediction(target_names=['negative', 'positive'], top=25)
        return res
Example #26
# Used in pickle pipeline on TF-IDF
def dummy(token):
    return token


# Load pre-trained ML model
model = pickle.load(open('model.pkl', 'rb'))  # NEEDS TO BE CREATED WITH BOTH FILES IN FOLDER pickle_model_for_webapp

# Create object of class preprocessing to clean data
reading = clean_data.preprocessing.preprocessing(convert_lower=True, use_spell_corrector=True, only_verbs_nouns=False)

# clf: define ML classifier
# vec: define vectorizer
# n_samples: sets the number of random examples to generate from given instance of text (default value 5000)
# use LIME method to train a white box classifier to make the same prediction as the black box one (pipeline)
te = TextExplainer(vec=TfidfVectorizer(ngram_range=(1, 2), preprocessor=dummy, token_pattern='(?u)\\b\\w+\\b'),
                   n_samples=5000, char_based=False, random_state=42)


def one_word_get_prediction_class_name(prediction):
    '''
    Pipeline with XGBoost - translate the prediction class number into words
    :param prediction: the predicted number/class
    :return: the predicted class in natural language
    '''
    # The order of classes in predict_proba: ['hate speech', 'neither', 'offensive language']
    if prediction == 0:
        output = "hate speech"
    elif prediction == 1:
        output = "neither"
    else:
        output = "offensive language"
Example #27
def test_text_explainer_position_dependent():
    text = "foo bar baz egg spam bar baz egg spam ham"

    @_apply_to_list
    def predict_proba(doc):
        tokens = doc.split()
        # 'bar' is only important in the beginning of the document,
        # not in the end
        return [0, 1] if len(tokens) >= 2 and tokens[1] == 'bar' else [1, 0]

    # bag of words model is not powerful enough to explain predict_proba above
    te = TextExplainer(random_state=42, vec=CountVectorizer())
    te.fit(text, predict_proba)
    print(te.metrics_)
    assert te.metrics_['score'] < 0.9
    assert te.metrics_['mean_KL_divergence'] > 0.3

    # position_dependent=True can make it work
    te = TextExplainer(position_dependent=True, random_state=42)
    te.fit(text, predict_proba)
    print(te.metrics_)
    assert te.metrics_['score'] > 0.95
    assert te.metrics_['mean_KL_divergence'] < 0.3

    expl = te.explain_prediction()
    format_as_all(expl, te.clf_)

    # it is also possible to almost make it work using a custom vectorizer
    vec = CountVectorizer(ngram_range=(1, 2))
    te = TextExplainer(vec=vec, random_state=42)
    te.fit(text, predict_proba)
    print(te.metrics_)
    assert te.metrics_['score'] > 0.95
    assert te.metrics_['mean_KL_divergence'] < 0.3

    expl = te.explain_prediction()
    format_as_all(expl, te.clf_)

    # custom vectorizers are not supported when position_dependent is True
    with pytest.raises(ValueError):
        te = TextExplainer(position_dependent=True, vec=HashingVectorizer())
Example #28
                    a_set = Sentence(p_str)
                    stacked_embeddings.embed(a_set)
                    to_ret = a_set.get_embedding().cpu().detach().numpy(
                    ).reshape(1, -1)
            except:
                print(type(X))
                print(X)
        return to_ret


pipe = joblib.load('saved_card_classification.pkl')

if keras:
    pipe.named_steps['model'].model = load_model('keras_model.h5')

te = TextExplainer(random_state=42, n_samples=3000, position_dependent=False)


def explain_pred(sentence):
    te.fit(sentence, pipe.predict_proba)
    #txt = format_as_text(te.explain_prediction(target_names=["green", "neutral", "red"]))
    t_pred = te.explain_prediction(top=20,
                                   target_names=[
                                       "ANB",
                                       "CAP",
                                       "ECON",
                                       "EDU",
                                       "ENV",
                                       "EX",
                                       "FED",
                                       "HEG",
Example #29
import jieba
seg_list = jieba.cut("看了快一半了才发现是mini的广告", cut_all=False)
list(seg_list)

# ### Example 1

# In[10]:

get_proba(["看 了 快 一半 了 才 发现 是 mini 的 广告"])

# In[11]:

from eli5.lime import TextExplainer

te = TextExplainer(random_state=42, n_samples=5000)
te.fit(" ".join(jieba.cut("看了快一半了才发现是mini的广告", cut_all=False)), get_proba)
te.show_prediction(target_names=["neg", "pos"])

# In[12]:

te.metrics_

# In[13]:

te.samples_[:10]

# #### Character-based Whitebox

# In[14]:
Example #30
    # opcodes_dir = '/home/hwangdz/coreutils/coreutils-8.28/install_m32/bin/md5funcs_ops'
    opcodes_dir = '/home/hwangdz/git/rl-select-div/only-similarity/explanation/%s_ops_info' % bin_name
    output_dir = 'explanation/%s_html' % bin_name
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for file_name in os.listdir(opcodes_dir):
        # if file_name != 'dump.s':
        #    continue
        if file_name == 'op_distribution':
            continue
        file_path = os.path.join(opcodes_dir, file_name)
        with open(file_path, 'r') as f:
            op_codes = f.read()
            if len(op_codes) < 20:
                continue
            num_ops = len(op_codes.split())
            op_codes = op_codes.replace('\n', ' ')
            opcode_explainer = TextExplainer(random_state=59, sampler=ops_sampler, n_samples=5000)
            #repeat_times = (len(op_codes.split()) / 100) ** 2
            repeat_times = 1
            for _ in range(repeat_times):
                opcode_explainer.fit(op_codes, ss.predict_proba)
            explanation = opcode_explainer.explain_prediction()._repr_html_()
            with open('explanation/%s_html/explanation-%s.html' % (bin_name, file_name), 'w') as ef:
                ef.write(explanation)
                ef.write('num of opcodes: %d\n' % num_ops)
                ef.write('</br>\n')
                ef.write(op_codes)