def compute_anchor(text):
    print("Initialize Anchor")
    explainer_anchor_cnn = AnchorText(nlp, predict_cnn)
    explanation_anchor_cnn = explainer_anchor_cnn.explain(
        text,
        desired_label=1,
        threshold=0.75,
        use_similarity_proba=False,
        use_unk=True,
        sample_proba=0.5,
        beam_size=1,
        tau=0.15,
    )
    del explainer_anchor_cnn
    return explanation_anchor_cnn
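# Minimal usage sketch for compute_anchor, assuming the surrounding script has already
# loaded the spaCy model as `nlp` and defined the CNN prediction function `predict_cnn`
# (both are referenced above but not defined in this snippet); the review text below is
# illustrative only.
if __name__ == '__main__':
    explanation = compute_anchor('the actors are very bad')
    # with this (older) API the explanation behaves like a dict; the estimated precision
    # of the anchor is available under 'precision' (see the test further down)
    print(explanation['precision'])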
def test_anchor_text(predict_type, present, use_proba, use_unk, threshold):
    # load data and create train and test sets
    data, labels = movie_sentiment()
    train, test, train_labels, test_labels = train_test_split(data, labels, test_size=.2, random_state=0)
    train_labels = np.array(train_labels)

    # apply CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.fit(train)

    # train Logistic Regression model
    clf = LogisticRegression()
    clf.fit(vectorizer.transform(train), train_labels)

    # define predict function
    if predict_type == 'proba':
        predict_fn = lambda x: clf.predict_proba(vectorizer.transform(x))
    elif predict_type == 'class':
        predict_fn = lambda x: clf.predict(vectorizer.transform(x))

    # test explainer initialization
    explainer = AnchorText(nlp, predict_fn)
    assert explainer.predict_fn(['book']).shape == (1,)

    # test sampling function
    text = 'This is a good book .'
    num_samples = 100
    sample_prob_unk = .5
    top_n = 500
    words, positions, sample_fn = explainer.get_sample_fn(text, use_proba=use_proba, use_unk=use_unk)
    raw_data, data, labels = sample_fn(present, num_samples, sample_prob_unk=sample_prob_unk, top_n=top_n)

    if use_proba:
        # check that words in present are in the proposed anchor
        assert len(present) * data.shape[0] == data[:, :-1].sum()  # exclude '.'

    if use_unk:
        # get list of unique words
        all_words = []
        for i in range(raw_data.shape[0]):
            all_words.append(raw_data[i][0].split())
        all_words = [word for word_list in all_words for word in word_list]

        # unique words = words in text + UNK
        assert len(np.unique(all_words)) == len(text.split()) + 1

        # check nb of UNKs
        assert data.shape[0] * data.shape[1] - data.sum() == Counter(all_words)['UNK']

    # test explanation
    explanation = explainer.explain(text, threshold=threshold, use_proba=use_proba, use_unk=use_unk)
    assert explanation['precision'] >= threshold

    # check if sampled sentences are not cut short
    keys = ['covered', 'covered_true', 'covered_false']
    for i in range(len(explanation['raw']['feature'])):
        example_dict = explanation['raw']['examples'][i]
        for k in keys:
            for example in example_dict[k]:
                assert example[0][-1] in ['.', 'K']
def test_anchor_text(lr_classifier, text, n_punctuation_marks, n_unique_words,
                     predict_type, anchor, use_similarity_proba, use_unk, threshold):
    # test parameters
    num_samples = 100
    sample_proba = .5
    top_n = 500
    temperature = 1.
    n_covered_ex = 5  # number of examples where the anchor applies to be returned

    # fit and initialise predictor
    clf, preprocessor = lr_classifier
    predictor = predict_fcn(predict_type, clf, preproc=preprocessor)

    # test explainer initialization
    explainer = AnchorText(nlp, predictor)
    assert explainer.predictor(['book']).shape == (1, )

    # setup explainer
    perturb_opts = {
        'use_similarity_proba': use_similarity_proba,
        'sample_proba': sample_proba,
        'temperature': temperature,
    }
    explainer.n_covered_ex = n_covered_ex
    explainer.set_words_and_pos(text)
    explainer.set_sampler_perturbation(use_unk, perturb_opts, top_n)
    explainer.set_data_type(use_unk)
    if predict_type == 'proba':
        label = np.argmax(predictor([text])[0])
    elif predict_type == 'class':
        label = predictor([text])[0]
    explainer.instance_label = label

    assert isinstance(explainer.dtype, str)
    assert len(explainer.punctuation) == n_punctuation_marks
    assert len(explainer.words) == len(explainer.positions)

    # test sampler
    cov_true, cov_false, labels, data, coverage, _ = explainer.sampler((0, anchor), num_samples)
    if not anchor:
        assert coverage == -1
    if use_similarity_proba and len(anchor) > 0:
        # check that words in present are in the proposed anchor
        assert len(anchor) * data.shape[0] == data[:, anchor].sum()

    if use_unk:
        # get list of unique words
        all_words = explainer.words
        # unique words = words in text + UNK
        assert len(np.unique(all_words)) == n_unique_words

    # test explanation
    explanation = explainer.explain(
        text,
        use_unk=use_unk,
        threshold=threshold,
        use_similarity_proba=use_similarity_proba,
    )
    assert explanation.precision >= threshold
    assert explanation.raw['prediction'].item() == label
    assert explanation.meta.keys() == DEFAULT_META_ANCHOR.keys()
    assert explanation.data.keys() == DEFAULT_DATA_ANCHOR.keys()

    # check if sampled sentences are not cut short
    keys = ['covered_true', 'covered_false']
    for i in range(len(explanation.raw['feature'])):
        example_dict = explanation.raw['examples'][i]
        for k in keys:
            for example in example_dict[k]:
                # check that we have perturbed the sentences
                if use_unk:
                    assert 'UNK' in example or example.replace(' ', '') == text.replace(' ', '')
                else:
                    assert 'UNK' not in example
                assert example[-1] in ['.', 'K']
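# predict_fcn is a test helper used above (and in the fixture below) that is not defined
# in this section; the following is only a sketch of what it might look like, kept
# consistent with the inline predict functions in the older test further up. The actual
# helper in the test utilities may differ.
def predict_fcn(predict_type, clf, preproc=None):
    # wrap the classifier, and an optional preprocessor, into a single prediction function
    transform = preproc.transform if preproc is not None else (lambda x: x)
    if predict_type == 'proba':
        return lambda x: clf.predict_proba(transform(x))
    return lambda x: clf.predict(transform(x))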
val_labels = np.array(val_labels)

# define and train a logistic regression pipeline
vectorizer = CountVectorizer(min_df=1)
clf = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('preprocess', vectorizer), ('clf', clf)])
print('Training ...')
pipeline.fit(train, train_labels)
print('Training done!')

preds_train = pipeline.predict(train)
preds_val = pipeline.predict(val)
preds_test = pipeline.predict(test)
print('Train accuracy', accuracy_score(train_labels, preds_train))
print('Validation accuracy', accuracy_score(val_labels, preds_val))
print('Test accuracy', accuracy_score(test_labels, preds_test))

print("Saving Model to model.joblib")
joblib.dump(pipeline, "model.joblib")

print("Creating Anchor Text explainer")
spacy_language_model = 'en_core_web_md'
spacy_model(model=spacy_language_model)
nlp = spacy.load(spacy_language_model)
anchors_text = AnchorText(nlp=nlp, predictor=lambda x: pipeline.predict(x))

# Test explanations locally
expl = anchors_text.explain("the actors are very bad")
print(expl)
def atext_explainer(lr_classifier, english_spacy_model, movie_sentiment_data):
    predictor = predict_fcn(predict_type='class',
                            clf=lr_classifier,
                            preproc=movie_sentiment_data['preprocessor'])
    atext = AnchorText(nlp=english_spacy_model, predictor=predictor)
    return atext
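# A hypothetical smoke test using the fixture above (the fixture is assumed to be
# registered with @pytest.fixture elsewhere, as its signature suggests); the review text
# and the 0.95 threshold are illustrative values, not taken from the source.
def test_atext_explainer_smoke(atext_explainer):
    explanation = atext_explainer.explain('the actors are very bad',
                                          threshold=0.95,
                                          use_unk=True)
    # the anchor should reach at least the requested precision on the perturbed samples
    assert explanation.precision >= 0.95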
# define and train a logistic regression pipeline
vectorizer = CountVectorizer(min_df=1)
clf = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('preprocess', vectorizer), ('clf', clf)])
print('Training ...')
pipeline.fit(train, train_labels)
print('Training done!')

preds_train = pipeline.predict(train)
preds_val = pipeline.predict(val)
preds_test = pipeline.predict(test)
print('Train accuracy', accuracy_score(train_labels, preds_train))
print('Validation accuracy', accuracy_score(val_labels, preds_val))
print('Test accuracy', accuracy_score(test_labels, preds_test))

print("Saving Model to model.joblib")
joblib.dump(pipeline, "model.joblib")

print("Creating Anchor Text explainer")
spacy_language_model = 'en_core_web_md'
spacy_model(model=spacy_language_model)
nlp = spacy.load(spacy_language_model)
predict_fn = lambda x: pipeline.predict(x)
anchors_text = AnchorText(nlp, predict_fn)

# Test explanations locally
expl = anchors_text.explain("the actors are very bad")
print(expl)
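# A hedged sketch of how the artefacts produced by the scripts above might be reloaded
# later, e.g. in a separate serving process: the pipeline comes back from model.joblib
# and the explainer is rebuilt on top of it, mirroring the construction used above.
# The function name and default arguments here are illustrative only.
import joblib
import spacy
from alibi.explainers import AnchorText

def load_explainer(model_path="model.joblib", spacy_language_model='en_core_web_md'):
    pipeline = joblib.load(model_path)
    nlp = spacy.load(spacy_language_model)
    return AnchorText(nlp, lambda x: pipeline.predict(x))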