def movie_sentiment_dataset():
    """
    Load the movie sentiment data and package it for use in tests.

    The data is split 80/20 into train and test sets with a fixed
    ``random_state`` and a ``CountVectorizer`` is fitted on the training
    texts only.

    Returns
    -------
    dict
        Keys ``'X_train'``, ``'y_train'``, ``'X_test'``, ``'y_test'``,
        ``'preprocessor'`` (the fitted vectorizer) and ``'metadata'``.
        Only ``'y_train'`` is converted to a NumPy array; ``'y_test'`` is
        returned exactly as produced by ``train_test_split``.
    """
    bunch = fetch_movie_sentiment()
    X_train, X_test, y_train, y_test = train_test_split(
        bunch.data, bunch.target, test_size=.2, random_state=0
    )

    # Fit the bag-of-words vocabulary on the training split only.
    bow = CountVectorizer(min_df=1)
    bow.fit(X_train)

    return {
        'X_train': X_train,
        'y_train': np.array(y_train),
        'X_test': X_test,
        'y_test': y_test,
        'preprocessor': bow,
        'metadata': {'name': 'movie_sentiment'},
    }
def test_anchor_text():
    """Explain one movie review with AnchorText and check the reported explainer name."""
    server = SKLearnServer(MOVIE_MODEL_URI)
    server.load()
    dataset = fetch_movie_sentiment()
    explainer = AnchorText(server.predict, None)

    # Seed NumPy's global RNG immediately before explaining.
    np.random.seed(0)
    review = dataset.data[4:5]
    result = explainer.explain(review)

    # Round-trip through JSON so the serialised form is what gets checked.
    payload = json.loads(result.to_json())
    assert payload["meta"]["name"] == "AnchorText"
def test_anchor_text():
    """Explain one movie review with AnchorText using an empty process environment.

    The environment is emptied for the duration of the test and restored
    afterwards, so that other tests running in the same process still see
    their original environment variables.
    """
    # Snapshot first: os.environ.clear() is process-wide and would otherwise
    # leak into every test that runs after this one.
    saved_env = dict(os.environ)
    os.environ.clear()
    try:
        # NOTE(review): the model is named "adult" but loaded from
        # MOVIE_MODEL_URI - confirm the name is intentional.
        skmodel = SKLearnModel("adult", MOVIE_MODEL_URI)
        skmodel.load()
        predictor = Predictor(skmodel)
        anchor_text = AnchorText(predictor.predict_fn, None)
        movies = fetch_movie_sentiment()

        # Seed NumPy's global RNG immediately before explaining.
        np.random.seed(0)
        explanation = anchor_text.explain(movies.data[4:5])
        exp_json = json.loads(explanation.to_json())
        print(exp_json["data"]["anchor"])
    finally:
        # Restore exactly the variables that were present on entry.
        os.environ.clear()
        os.environ.update(saved_env)
def test_anchor_text():
    """Explain one movie review with AnchorText using an empty process environment.

    The environment is emptied for the duration of the test and restored
    afterwards, so that other tests running in the same process still see
    their original environment variables.
    """
    # Snapshot first: os.environ.clear() is process-wide and would otherwise
    # leak into every test that runs after this one.
    saved_env = dict(os.environ)
    os.environ.clear()
    try:
        skmodel = SKLearnServer(MOVIE_MODEL_URI)
        skmodel.load()
        movies = fetch_movie_sentiment()
        anchor_text = AnchorText(skmodel.predict, None)

        # Seed NumPy's global RNG immediately before explaining.
        np.random.seed(0)
        explanation = anchor_text.explain(movies.data[4:5])
        exp_json = json.loads(explanation.to_json())
        print(exp_json["data"]["anchor"])
    finally:
        # Restore exactly the variables that were present on entry.
        os.environ.clear()
        os.environ.update(saved_env)
def test_movie_sentiment(return_X_y):
    """Check the structure of what ``fetch_movie_sentiment`` returns.

    With ``return_X_y`` set, the result must be a 2-element ``(X, y)`` pair;
    otherwise it must have length 3 and expose ``data`` and ``target``
    attributes. In both cases the texts and labels must have equal length
    and the labels must contain ``MOVIE_CLASSES`` distinct values.
    """
    fetched = fetch_movie_sentiment(return_X_y=return_X_y)

    # Validate the container size before unpacking it.
    assert len(fetched) == (2 if return_X_y else 3)
    if return_X_y:
        X, y = fetched
    else:
        X, y = fetched.data, fetched.target

    assert len(X) == len(y)
    assert len(set(y)) == MOVIE_CLASSES
def prepare_data(test_size):
    """Load the movie sentiment data into a DataFrame and optionally split it.

    Parameters
    ----------
    test_size
        Forwarded to ``train_test_split`` as ``test_size``. A value of zero
        (within a small absolute tolerance) disables the split.

    Returns
    -------
    tuple
        ``(data, None)`` when ``test_size`` is zero, otherwise
        ``(train, test)`` as produced by ``train_test_split``. The frames
        have the columns ``'text'`` and ``'labels'``.
    """
    # load data
    X, y = fetch_movie_sentiment(return_X_y=True)

    # prepare data
    data = pd.DataFrame({'text': X, 'labels': y})

    # math.isclose defaults to abs_tol=0.0, and a purely relative tolerance
    # can never match against 0.0 unless the value is exactly zero. An
    # explicit absolute tolerance makes the "no split" check behave as a
    # real closeness test.
    if math.isclose(test_size, 0.0, abs_tol=1e-9):
        return data, None

    train, test = train_test_split(data, test_size=test_size)
    return train, test
def test_movie_sentiment(return_X_y):
    """Check the structure of what ``fetch_movie_sentiment`` returns.

    The test is skipped when fetching raises ``RequestException``. With
    ``return_X_y`` set, the result must be a 2-element ``(X, y)`` pair;
    otherwise it must have length 3 and expose ``data`` and ``target``
    attributes. In both cases the texts and labels must have equal length
    and the labels must contain ``MOVIE_CLASSES`` distinct values.
    """
    try:
        fetched = fetch_movie_sentiment(return_X_y=return_X_y)
    except RequestException:
        # A failed download is not a failure of the code under test.
        pytest.skip('Movie sentiment dataset URL down')

    # Validate the container size before unpacking it.
    assert len(fetched) == (2 if return_X_y else 3)
    if return_X_y:
        X, y = fetched
    else:
        X, y = fetched.data, fetched.target

    assert len(X) == len(y)
    assert len(set(y)) == MOVIE_CLASSES
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from alibi.datasets import fetch_movie_sentiment
from sklearn.pipeline import Pipeline
import joblib
from alibi.explainers import AnchorText
import spacy
from alibi.utils.download import spacy_model

# Load the movie sentiment dataset and pull out the texts, the labels and
# the label names.
movies = fetch_movie_sentiment()
# The result of this call is discarded; it has no effect when run as a script.
movies.keys()
data = movies.data
labels = movies.target
target_names = movies.target_names

# Define the train, validation and test sets.
# Seed NumPy's global RNG. Both splits below pass an explicit random_state,
# so the seed is presumably intended for later steps - TODO confirm.
np.random.seed(0)
# First hold out 20% of the data as the test set ...
train, test, train_labels, test_labels = train_test_split(data, labels, test_size=.2, random_state=42)
# ... then hold out 10% of the remaining training data as the validation set.
train, val, train_labels, val_labels = train_test_split(train, train_labels, test_size=.1, random_state=42)
# Only the train and test labels are converted to arrays here; val_labels is
# left as returned by train_test_split.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
def test_anchor_text(predict_type, present, use_similarity_proba, use_unk, threshold):
    """Exercise AnchorText end to end on a logistic-regression sentiment model.

    Covers explainer construction, the perturbation sampler returned by
    ``get_sample_fn`` and the final explanation produced by ``explain``.
    """
    # Build the train split of the movie sentiment data.
    dataset = fetch_movie_sentiment()
    train_texts, _, train_targets, _ = train_test_split(
        dataset.data, dataset.target, test_size=.2, random_state=0
    )
    train_targets = np.array(train_targets)

    # Bag-of-words features feeding a logistic regression classifier.
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.fit(train_texts)
    clf = LogisticRegression()
    clf.fit(vectorizer.transform(train_texts), train_targets)

    # Prediction function handed to the explainer: probabilities or classes.
    if predict_type == 'proba':
        def predict_fn(x):
            return clf.predict_proba(vectorizer.transform(x))
    elif predict_type == 'class':
        def predict_fn(x):
            return clf.predict(vectorizer.transform(x))

    # Explainer initialisation: the stored predict_fn yields one value per input.
    explainer = AnchorText(nlp, predict_fn)
    assert explainer.predict_fn(['book']).shape == (1, )

    # Sampling function.
    text = 'This is a good book .'
    num_samples = 100
    _, _, sample_fn = explainer.get_sample_fn(
        text,
        use_similarity_proba=use_similarity_proba,
        use_unk=use_unk,
        sample_proba=.5,
        top_n=500,
    )
    raw_samples, sample_mask, _ = sample_fn(present, num_samples)

    if use_similarity_proba and len(present) > 0:
        # Words in `present` must be kept in every sampled sentence.
        assert len(present) * sample_mask.shape[0] == sample_mask[:, present].sum()

    if use_unk:
        # Flatten every sampled sentence into a single list of words.
        all_words = [word for row in raw_samples for word in row[0].split()]

        # unique words = words in text + UNK
        assert len(np.unique(all_words)) == len(text.split()) + 1

        # The number of zeros in the mask must equal the number of UNK tokens.
        n_masked = sample_mask.shape[0] * sample_mask.shape[1] - sample_mask.sum()
        assert n_masked == Counter(all_words)['UNK']

    # Explanation.
    explanation = explainer.explain(
        text, threshold=threshold, use_proba=use_similarity_proba, use_unk=use_unk
    )
    assert explanation['precision'] >= threshold

    # Sampled sentences must not be cut short: each ends with '.' or the
    # final letter of 'UNK'.
    example_keys = ('covered', 'covered_true', 'covered_false')
    for idx in range(len(explanation['raw']['feature'])):
        example_dict = explanation['raw']['examples'][idx]
        for key in example_keys:
            for example in example_dict[key]:
                assert example[0][-1] in ['.', 'K']