Code example #1
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from alibi.datasets import fetch_movie_sentiment


def movie_sentiment_dataset():
    """
    Load and prepare movie sentiment data.
    """
    # fetch the raw movie reviews and their sentiment labels
    movies = fetch_movie_sentiment()
    data = movies.data
    labels = movies.target

    # hold out 20% of the data as a test set
    train, test, train_labels, test_labels = train_test_split(data,
                                                              labels,
                                                              test_size=.2,
                                                              random_state=0)
    train_labels = np.array(train_labels)

    # fit a bag-of-words vectorizer on the training split only
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.fit(train)

    return {
        'X_train': train,
        'y_train': train_labels,
        'X_test': test,
        'y_test': test_labels,
        'preprocessor': vectorizer,
        'metadata': {
            'name': 'movie_sentiment'
        },
    }
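The dictionary returned above slots into a standard scikit-learn workflow: transform the text with the stored preprocessor, then fit any classifier. A minimal sketch, using the LogisticRegression that code examples #8 and #9 also use (the variable names are illustrative):

from sklearn.linear_model import LogisticRegression

dataset = movie_sentiment_dataset()
clf = LogisticRegression(solver='liblinear')
clf.fit(dataset['preprocessor'].transform(dataset['X_train']),
        dataset['y_train'])
preds = clf.predict(dataset['preprocessor'].transform(dataset['X_test']))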
Code example #2
def test_anchor_text():
    # SKLearnServer, MOVIE_MODEL_URI and AnchorText are defined by the
    # surrounding kfserving test module; AnchorText here wraps a raw
    # predict function rather than taking a spaCy model first
    skmodel = SKLearnServer(MOVIE_MODEL_URI)
    skmodel.load()
    movies = fetch_movie_sentiment()
    anchor_text = AnchorText(skmodel.predict, None)

    # fix the seed so the anchor search is reproducible
    np.random.seed(0)
    explanation = anchor_text.explain(movies.data[4:5])
    exp_json = json.loads(explanation.to_json())
    assert exp_json["meta"]["name"] == "AnchorText"
Code example #3
File: test_anchor_text.py Project: zyxue/kfserving
def test_anchor_text():
    # reset the environment so no stray variables affect the model server
    os.environ.clear()
    # SKLearnModel, Predictor, MOVIE_MODEL_URI and AnchorText are defined
    # by the surrounding kfserving test module
    skmodel = SKLearnModel("adult", MOVIE_MODEL_URI)
    skmodel.load()
    predictor = Predictor(skmodel)
    anchor_text = AnchorText(predictor.predict_fn, None)
    movies = fetch_movie_sentiment()
    # fix the seed so the anchor search is reproducible
    np.random.seed(0)
    explanation = anchor_text.explain(movies.data[4:5])
    exp_json = json.loads(explanation.to_json())
    print(exp_json["data"]["anchor"])
Code example #4
def test_anchor_text():
    # a variant of code example #2: the environment is cleared first and
    # the resulting anchor is printed instead of asserted on
    os.environ.clear()
    skmodel = SKLearnServer(MOVIE_MODEL_URI)
    skmodel.load()
    movies = fetch_movie_sentiment()
    anchor_text = AnchorText(skmodel.predict, None)

    np.random.seed(0)
    explanation = anchor_text.explain(movies.data[4:5])
    exp_json = json.loads(explanation.to_json())
    print(exp_json["data"]["anchor"])
Code example #5
File: test_datasets.py Project: gangh/alibi
def test_movie_sentiment(return_X_y):
    # return_X_y is supplied by pytest parametrization; MOVIE_CLASSES is a
    # constant in the surrounding test module (the dataset is binary
    # sentiment)
    data = fetch_movie_sentiment(return_X_y=return_X_y)
    if return_X_y:
        # a (data, target) tuple is returned
        assert len(data) == 2
        X, y = data
    else:
        # a Bunch exposing data, target and target_names is returned
        assert len(data) == 3
        X = data.data
        y = data.target

    assert len(X) == len(y)
    assert len(set(y)) == MOVIE_CLASSES
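The parametrize decorator is not part of this excerpt; a minimal sketch of the assumed setup (the same pattern applies to code example #7):

import pytest

@pytest.mark.parametrize('return_X_y', [True, False])
def test_movie_sentiment(return_X_y):
    ...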
Code example #6
import math

import pandas as pd
from sklearn.model_selection import train_test_split
from alibi.datasets import fetch_movie_sentiment


def prepare_data(test_size):
    # load data
    X, y = fetch_movie_sentiment(return_X_y=True)

    # prepare a DataFrame with one text column and one label column
    data = pd.DataFrame()
    data['text'] = X
    data['labels'] = y

    # with test_size == 0 return everything as a single training set
    if math.isclose(test_size, 0.0):
        return data, None
    else:
        train, test = train_test_split(data, test_size=test_size)
        return train, test
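A quick usage sketch (the 0.2 split fraction is just an illustrative value):

train, test = prepare_data(test_size=0.2)
print(len(train), len(test))

full, _ = prepare_data(test_size=0.0)  # no held-out split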
Code example #7
import pytest
from requests import RequestException

from alibi.datasets import fetch_movie_sentiment


def test_movie_sentiment(return_X_y):
    # as in code example #5, return_X_y comes from pytest parametrization
    # and MOVIE_CLASSES is a module-level constant; the try/except skips
    # the test instead of failing when the dataset host is unreachable
    try:
        data = fetch_movie_sentiment(return_X_y=return_X_y)
    except RequestException:
        pytest.skip('Movie sentiment dataset URL down')
    if return_X_y:
        assert len(data) == 2
        X, y = data
    else:
        assert len(data) == 3
        X = data.data
        y = data.target

    assert len(X) == len(y)
    assert len(set(y)) == MOVIE_CLASSES
Code example #8
File: train.py Project: pradithya/kfserving
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from alibi.datasets import fetch_movie_sentiment
from sklearn.pipeline import Pipeline
import joblib
from alibi.explainers import AnchorText
import spacy
from alibi.utils.download import spacy_model

# load data
movies = fetch_movie_sentiment()
movies.keys()  # Bunch exposing 'data', 'target' and 'target_names'
data = movies.data
labels = movies.target
target_names = movies.target_names

# define train, validation and test sets
np.random.seed(0)
train, test, train_labels, test_labels = train_test_split(data,
                                                          labels,
                                                          test_size=.2,
                                                          random_state=42)
train, val, train_labels, val_labels = train_test_split(train,
                                                        train_labels,
                                                        test_size=.1,
                                                        random_state=42)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
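The excerpt ends here, but the imports at the top (Pipeline, LogisticRegression, accuracy_score, joblib) show where the script is headed. A hedged sketch of that continuation; the pipeline step names, solver and output filename are assumptions, not taken from the project:

pipeline = Pipeline([('vectorizer', CountVectorizer(min_df=1)),
                     ('clf', LogisticRegression(solver='liblinear'))])
pipeline.fit(train, train_labels)
print('test accuracy:', accuracy_score(test_labels, pipeline.predict(test)))
joblib.dump(pipeline, 'model.joblib')  # filename is an assumption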
Code example #9
def test_anchor_text(predict_type, present, use_similarity_proba, use_unk,
                     threshold):
    # all five arguments are supplied by pytest parametrization;
    # fetch_movie_sentiment, CountVectorizer, LogisticRegression, Counter
    # and the spaCy model `nlp` are imported/loaded at module level

    # load data and create train and test sets
    movies = fetch_movie_sentiment()
    data = movies.data
    labels = movies.target
    train, test, train_labels, test_labels = train_test_split(data,
                                                              labels,
                                                              test_size=.2,
                                                              random_state=0)
    train_labels = np.array(train_labels)

    # apply CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.fit(train)

    # train Logistic Regression model
    clf = LogisticRegression()
    clf.fit(vectorizer.transform(train), train_labels)

    # define predict function
    if predict_type == 'proba':
        predict_fn = lambda x: clf.predict_proba(vectorizer.transform(x))
    elif predict_type == 'class':
        predict_fn = lambda x: clf.predict(vectorizer.transform(x))

    # test explainer initialization (here AnchorText takes the spaCy
    # model first, then the predict function)
    explainer = AnchorText(nlp, predict_fn)
    assert explainer.predict_fn(['book']).shape == (1, )

    # test sampling function
    text = 'This is a good book .'
    num_samples = 100
    sample_proba = .5
    top_n = 500
    words, positions, sample_fn = explainer.get_sample_fn(
        text,
        use_similarity_proba=use_similarity_proba,
        use_unk=use_unk,
        sample_proba=sample_proba,
        top_n=top_n)
    raw_data, data, labels = sample_fn(present, num_samples)

    # check that words in `present` are in the proposed anchor
    if use_similarity_proba and len(present) > 0:
        assert len(present) * data.shape[0] == data[:, present].sum()

    if use_unk:
        # get list of unique words
        all_words = []
        for i in range(raw_data.shape[0]):
            all_words.append(raw_data[i][0].split())
        all_words = [word for word_list in all_words for word in word_list]

        # unique words = words in text + UNK
        assert len(np.unique(all_words)) == len(text.split()) + 1

        # check the number of UNKs
        assert data.shape[0] * data.shape[1] - data.sum() == Counter(all_words)['UNK']

    # test explanation
    explanation = explainer.explain(text,
                                    threshold=threshold,
                                    use_proba=use_similarity_proba,
                                    use_unk=use_unk)
    assert explanation['precision'] >= threshold
    # check that sampled sentences are not cut short: every covered example
    # must end in the final '.' of the original text or the 'K' of 'UNK'
    keys = ['covered', 'covered_true', 'covered_false']
    for i in range(len(explanation['raw']['feature'])):
        example_dict = explanation['raw']['examples'][i]
        for k in keys:
            for example in example_dict[k]:
                assert example[0][-1] in ['.', 'K']
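As with code example #5, the five fixture arguments are expected to come from pytest parametrization. A sketch of one plausible configuration; the concrete values are assumptions, not taken from the project:

import pytest

@pytest.mark.parametrize('predict_type', ['proba', 'class'])  # assumed values
@pytest.mark.parametrize('present', [[], [0, 2]])  # assumed values
@pytest.mark.parametrize('use_similarity_proba, use_unk',
                         [(False, True), (True, False)])  # assumed values
@pytest.mark.parametrize('threshold', [0.95])  # assumed value
def test_anchor_text(predict_type, present, use_similarity_proba, use_unk,
                     threshold):
    ...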