Example #1
def compute_anchor(text):
    print("Initialize Anchor")
    explainer_anchor_cnn = AnchorText(nlp, predict_cnn)
    explanation_anchor_cnn = explainer_anchor_cnn.explain(
        text, desired_label=1, threshold=0.75, use_similarity_proba=False,
        use_unk=True, sample_proba=0.5, beam_size=1, tau=0.15)
    del explainer_anchor_cnn
    return explanation_anchor_cnn
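
This helper assumes a module-level spaCy pipeline nlp and a predict_cnn prediction function, neither of which is shown. A minimal sketch of how it could be wired up, using the older AnchorText(nlp, predictor) signature from the snippet and a small bag-of-words classifier as a purely illustrative stand-in for the CNN:

import numpy as np
import spacy
from alibi.explainers import AnchorText
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

nlp = spacy.load('en_core_web_md')  # spaCy model used for the word perturbations

# Illustrative stand-in for the trained CNN: any callable mapping a list of
# strings to class predictions works as the predictor here.
texts = ['a great film', 'an awful film', 'a wonderful story', 'a terrible plot']
stand_in = Pipeline([('vec', CountVectorizer()), ('clf', LogisticRegression())])
stand_in.fit(texts, np.array([1, 0, 1, 0]))
predict_cnn = lambda x: stand_in.predict(x)

print(compute_anchor('a great and wonderful film'))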
Example #2
def test_anchor_text(predict_type, present, use_proba, use_unk, threshold):
    # load data and create train and test sets
    data, labels = movie_sentiment()
    train, test, train_labels, test_labels = train_test_split(data, labels, test_size=.2, random_state=0)
    train_labels = np.array(train_labels)

    # apply CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.fit(train)

    # train Logistic Regression model
    clf = LogisticRegression()
    clf.fit(vectorizer.transform(train), train_labels)

    # define predict function
    if predict_type == 'proba':
        predict_fn = lambda x: clf.predict_proba(vectorizer.transform(x))
    elif predict_type == 'class':
        predict_fn = lambda x: clf.predict(vectorizer.transform(x))

    # test explainer initialization
    explainer = AnchorText(nlp, predict_fn)
    assert explainer.predict_fn(['book']).shape == (1,)

    # test sampling function
    text = 'This is a good book .'
    num_samples = 100
    sample_prob_unk = .5
    top_n = 500
    words, positions, sample_fn = explainer.get_sample_fn(text, use_proba=use_proba, use_unk=use_unk)
    raw_data, data, labels = sample_fn(present, num_samples, sample_prob_unk=sample_prob_unk, top_n=top_n)

    if use_proba:  # check that words in present are in the proposed anchor
        assert len(present) * data.shape[0] == data[:, :-1].sum()  # exclude '.'

    if use_unk:
        # get list of unique words
        all_words = []
        for i in range(raw_data.shape[0]):
            all_words.append(raw_data[i][0].split())
        all_words = [word for word_list in all_words for word in word_list]

        # unique words = words in text + UNK
        assert len(np.unique(all_words)) == len(text.split()) + 1

        # check nb of UNKs
        assert data.shape[0] * data.shape[1] - data.sum() == Counter(all_words)['UNK']

    # test explanation
    explanation = explainer.explain(text, threshold=threshold, use_proba=use_proba, use_unk=use_unk)
    assert explanation['precision'] >= threshold
    # check that the sampled sentences are not cut short
    keys = ['covered', 'covered_true', 'covered_false']
    for i in range(len(explanation['raw']['feature'])):
        example_dict = explanation['raw']['examples'][i]
        for k in keys:
            for example in example_dict[k]:
                assert example[0][-1] in ['.', 'K']
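
The argument list suggests this test is driven by pytest parametrization. A possible set of decorators (the concrete parameter values are illustrative, not taken from the original test module):

import pytest

@pytest.mark.parametrize('predict_type', ('proba', 'class'))
@pytest.mark.parametrize('present', ((), (3,)))  # word positions forced into the candidate anchor
@pytest.mark.parametrize('use_proba, use_unk', ((False, True), (True, False)))
@pytest.mark.parametrize('threshold', (0.75, 0.95))
def test_anchor_text(predict_type, present, use_proba, use_unk, threshold):
    ...  # body as in the example above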
Example #3
def test_anchor_text(lr_classifier, text, n_punctuation_marks, n_unique_words,
                     predict_type, anchor, use_similarity_proba, use_unk,
                     threshold):
    # test parameters
    num_samples = 100
    sample_proba = .5
    top_n = 500
    temperature = 1.
    n_covered_ex = 5  # number of covered examples (where the anchor applies) to return

    # fit and initialise predictor
    clf, preprocessor = lr_classifier
    predictor = predict_fcn(predict_type, clf, preproc=preprocessor)

    # test explainer initialization
    explainer = AnchorText(nlp, predictor)
    assert explainer.predictor(['book']).shape == (1, )

    # setup explainer
    perturb_opts = {
        'use_similarity_proba': use_similarity_proba,
        'sample_proba': sample_proba,
        'temperature': temperature,
    }
    explainer.n_covered_ex = n_covered_ex
    explainer.set_words_and_pos(text)
    explainer.set_sampler_perturbation(use_unk, perturb_opts, top_n)
    explainer.set_data_type(use_unk)
    if predict_type == 'proba':
        label = np.argmax(predictor([text])[0])
    elif predict_type == 'class':
        label = predictor([text])[0]
    explainer.instance_label = label

    assert isinstance(explainer.dtype, str)
    assert len(explainer.punctuation) == n_punctuation_marks
    assert len(explainer.words) == len(explainer.positions)

    # test sampler
    cov_true, cov_false, labels, data, coverage, _ = explainer.sampler(
        (0, anchor), num_samples)
    if not anchor:
        assert coverage == -1
    if use_similarity_proba and len(anchor) > 0:
        # check that the anchor words are present in every perturbed sample
        assert len(anchor) * data.shape[0] == data[:, anchor].sum()

    if use_unk:
        # get list of unique words
        all_words = explainer.words
        # unique words = words in text + UNK
        assert len(np.unique(all_words)) == n_unique_words

    # test explanation
    explanation = explainer.explain(
        text,
        use_unk=use_unk,
        threshold=threshold,
        use_similarity_proba=use_similarity_proba,
    )
    assert explanation.precision >= threshold
    assert explanation.raw['prediction'].item() == label
    assert explanation.meta.keys() == DEFAULT_META_ANCHOR.keys()
    assert explanation.data.keys() == DEFAULT_DATA_ANCHOR.keys()

    # check that the sampled sentences are not cut short
    keys = ['covered_true', 'covered_false']
    for i in range(len(explanation.raw['feature'])):
        example_dict = explanation.raw['examples'][i]
        for k in keys:
            for example in example_dict[k]:
                # check that we have perturbed the sentences
                if use_unk:
                    assert 'UNK' in example or example.replace(
                        ' ', '') == text.replace(' ', '')
                else:
                    assert 'UNK' not in example
                assert example[-1] in ['.', 'K']
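
This version pulls the classifier from an lr_classifier fixture returning (clf, preprocessor) and relies on the predict_fcn and nlp helpers from the test suite. A minimal stand-in fixture could look like the following (the dataset loader and training details are assumptions, not the original fixture):

import numpy as np
import pytest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from alibi.datasets import fetch_movie_sentiment

@pytest.fixture(scope='module')
def lr_classifier():
    # return (classifier, fitted preprocessor), matching how the test unpacks it
    movies = fetch_movie_sentiment()
    X, y = movies.data, np.array(movies.target)
    preprocessor = CountVectorizer(min_df=1).fit(X)
    clf = LogisticRegression(solver='liblinear').fit(preprocessor.transform(X), y)
    return clf, preprocessor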
Example #4
val_labels = np.array(val_labels)

# define and train a logistic regression model
vectorizer = CountVectorizer(min_df=1)
clf = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('preprocess', vectorizer), ('clf', clf)])

print('Training ...')
pipeline.fit(train, train_labels)
print('Training done!')

preds_train = pipeline.predict(train)
preds_val = pipeline.predict(val)
preds_test = pipeline.predict(test)
print('Train accuracy', accuracy_score(train_labels, preds_train))
print('Validation accuracy', accuracy_score(val_labels, preds_val))
print('Test accuracy', accuracy_score(test_labels, preds_test))

print("Saving Model to model.joblib")
joblib.dump(pipeline, "model.joblib")

print("Creating Anchor Text explainer")
spacy_language_model = 'en_core_web_md'
spacy_model(model=spacy_language_model)
nlp = spacy.load(spacy_language_model)
anchors_text = AnchorText(nlp=nlp, predictor=lambda x: pipeline.predict(x))

# Test explanations locally
expl = anchors_text.explain("the actors are very bad")
print(expl)
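
Because the pipeline is serialized with joblib, the explainer can be rebuilt later from the saved file (for example in a serving container) using the same calls as above; a minimal sketch:

import joblib
import spacy
from alibi.explainers import AnchorText

pipeline = joblib.load('model.joblib')
nlp = spacy.load('en_core_web_md')
explainer = AnchorText(nlp=nlp, predictor=lambda x: pipeline.predict(x))
print(explainer.explain('the plot was predictable and dull'))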
Example #5
def atext_explainer(lr_classifier, english_spacy_model, movie_sentiment_data):
    predictor = predict_fcn(predict_type='class',
                            clf=lr_classifier,
                            preproc=movie_sentiment_data['preprocessor'])
    atext = AnchorText(nlp=english_spacy_model, predictor=predictor)
    return atext
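
This is presumably a pytest fixture (the decorator is not shown in the snippet). A test consuming it would simply take it as an argument, for example:

def test_explainer_predictor_shape(atext_explainer):
    # the fixture wires AnchorText to a class-output predictor,
    # so a single input text should yield a single label
    assert atext_explainer.predictor(['a fine film']).shape == (1,)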
Example #6
# define and train a logistic regression model
vectorizer = CountVectorizer(min_df=1)
clf = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('preprocess', vectorizer), ('clf', clf)])

print('Training ...')
pipeline.fit(train, train_labels)
print('Training done!')

preds_train = pipeline.predict(train)
preds_val = pipeline.predict(val)
preds_test = pipeline.predict(test)
print('Train accuracy', accuracy_score(train_labels, preds_train))
print('Validation accuracy', accuracy_score(val_labels, preds_val))
print('Test accuracy', accuracy_score(test_labels, preds_test))

print("Saving Model to model.joblib")
joblib.dump(pipeline, "model.joblib")

print("Creating Anchor Text explainer")
spacy_language_model = 'en_core_web_md'
spacy_model(model=spacy_language_model)
nlp = spacy.load(spacy_language_model)
predict_fn = lambda x: pipeline.predict(x)
anchors_text = AnchorText(nlp, predict_fn)

# Test explanations locally
expl = anchors_text.explain("the actors are very bad")
print(expl)
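
Instead of printing the whole explanation object, individual fields can be inspected; the attribute and key names below follow Example #3 and may differ between alibi versions:

print('precision:', expl.precision)
print('anchor features:', expl.raw['feature'])
print('covered (true) examples:', expl.raw['examples'][-1]['covered_true'][:2])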