Beispiel #1
0
def test_vanilla():
    """Fit a vectorizer + BiLSTM pipeline on toy data and check its score.

    The pipeline is scored on the same four samples it was fitted on and
    the score is required to exceed 0.6.
    """
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    steps = [
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier(nb_epochs=10)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(texts, labels)

    training_score = pipeline.score(texts, labels)
    assert training_score > 0.6
Beispiel #2
0
def test_predict_proba():
    """Check that predict_proba returns valid probabilities for every sample.

    The output must contain one row per input sample and every value must
    lie in the closed interval [0, 1].
    """
    X = ["One", "One only", "Two nothing else", "Two and three"]
    Y = np.array([0, 0, 1, 1])

    model = Pipeline([('vec', KerasVectorizer()), ('clf', BiLSTMClassifier())])
    model.fit(X, Y)
    Y_pred_prob = np.asarray(model.predict_proba(X))

    # One row of probabilities per sample, whatever the number of columns.
    assert Y_pred_prob.shape[0] == Y.shape[0]
    # np.all reduces over every element, so the checks are independent of
    # whether the output is shaped (n,), (n, 1) or (n, k); the builtin sum()
    # only worked for the first two and raised an ambiguous-truth-value
    # error for the last.
    assert np.all(Y_pred_prob >= 0)
    assert np.all(Y_pred_prob <= 1)
Beispiel #3
0
def test_threshold():
    """Check that predict agrees with thresholding predict_proba at 0.1.

    The classifier is built with ``threshold=0.1``; its predictions must
    equal ``predict_proba(X) > 0.1`` element for element.
    """
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    pipeline = Pipeline([
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier(threshold=0.1)),
    ])
    pipeline.fit(texts, labels)

    expected = pipeline.predict_proba(texts) > 0.1
    predicted = pipeline.predict(texts)
    assert np.array_equal(expected, predicted)
Beispiel #4
0
def test_multilabel():
    """Fit the BiLSTM pipeline in multilabel mode and check score and shape.

    Five texts are labelled with 4-column binary indicator rows; the score
    on the fitted data must exceed 0.4 and predictions must have shape
    (5, 4).
    """
    texts = [
        "One and two",
        "One only",
        "Three and four, nothing else",
        "Two nothing else",
        "Two and three",
    ]
    labels = np.array([
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 1, 0],
    ])

    pipeline = Pipeline([
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier(multilabel=True)),
    ])
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.4
    predictions = pipeline.predict(texts)
    assert predictions.shape == (5, 4)
Beispiel #5
0
def test_early_stopping():
    """Fit with early stopping enabled and a very large epoch budget.

    The score on the fitted data must exceed 0.6.
    """
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    classifier = BiLSTMClassifier(early_stopping=True, nb_epochs=10000)
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])

    # With 10000 epochs requested, this fit only finishes in reasonable
    # time if early stopping actually kicks in.
    pipeline.fit(texts, labels)

    score = pipeline.score(texts, labels)
    assert score > 0.6
Beispiel #6
0
def _unigram_tfidf():
    """Return the unigram TF-IDF vectorizer shared by the tfidf + SVM/KNN approaches."""
    return TfidfVectorizer(
        stop_words="english",
        max_df=0.95,
        min_df=0.0,
        ngram_range=(1, 1),
    )


def _bigram_tfidf():
    """Return the uni+bigram TF-IDF vectorizer (min_df=5) shared by several approaches."""
    return TfidfVectorizer(
        min_df=5,
        stop_words="english",
        ngram_range=(1, 2),
    )


def _linear_svc():
    """Return a linear-kernel SVC with probability estimates enabled."""
    return SVC(kernel="linear", probability=True)


def _embedding_tfidf_sgd(name, embedding_vectorizer, alpha):
    """Return a pipeline uniting a normalised embedding with normalised TF-IDF, fed to SGD.

    ``name`` is used to build the step names (``name``, ``name_unscaled``,
    ``scale_name``) so they can still be addressed through ``set_params``.
    """
    return Pipeline([
        (
            "vec",
            FeatureUnion([
                (
                    name,
                    Pipeline([
                        (name + "_unscaled", embedding_vectorizer),
                        ("scale_" + name, Normalizer()),
                    ]),
                ),
                (
                    "tfidf",
                    Pipeline([
                        ("tfidf_unscaled", _bigram_tfidf()),
                        ("scale_tfidf", Normalizer()),
                    ]),
                ),
            ]),
        ),
        (
            "sgd",
            OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=alpha),
                                n_jobs=-1),
        ),
    ])


def _tfidf_onehot_svm(column):
    """Return a pipeline uniting TF-IDF of ``x["text"]`` with a one-hot of ``x[[column]]``, fed to SVM."""
    return Pipeline([
        (
            "vectorizer",
            FeatureUnion([
                (
                    "text_features",
                    Pipeline([
                        ("selector", FunctionTransformer(lambda x: x["text"])),
                        ("tfidf", _bigram_tfidf()),
                    ]),
                ),
                (
                    # The step is named "team_features" for every column,
                    # matching the names callers may pass to set_params.
                    "team_features",
                    Pipeline([
                        (
                            "selector",
                            FunctionTransformer(lambda x: x[[column]]),
                        ),
                        ("one hot", OneHotEncoder(handle_unknown="ignore")),
                    ]),
                ),
            ]),
        ),
        (
            "svm",
            OneVsRestClassifier(SVC(class_weight="balanced", kernel="linear")),
        ),
    ])


def create_model(approach, parameters=None):
    """Build the (unfitted) model identified by ``approach``.

    Args:
        approach: identifier of the model to build, e.g. ``"tfidf-svm"``,
            ``"bilstm"`` or ``"mesh-cnn"``; see the branches below for the
            full list.
        parameters: optional overrides applied with ``model.set_params``.
            Either a string holding a Python dict literal (parsed with
            ``ast.literal_eval``) or an already-built dict. Falsy values
            (``None``, ``""``, ``{}``) leave the model untouched.

    Returns:
        The constructed model with any parameter overrides applied.

    Raises:
        ApproachNotImplemented: if ``approach`` is not one of the known
            identifiers.
    """
    if approach == "tfidf-svm":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", OneVsRestClassifier(_linear_svc())),
        ])
    elif approach == "tfidf-transformers-svm":
        model = TfidfTransformersSVM()
    elif approach == "bert-svm":
        model = Pipeline([
            ("bert", BertVectorizer(pretrained="bert")),
            ("svm", OneVsRestClassifier(_linear_svc())),
        ])
    elif approach == "scibert-svm":
        model = Pipeline([
            ("scibert", BertVectorizer(pretrained="scibert")),
            ("svm", OneVsRestClassifier(_linear_svc())),
        ])
    elif approach == "spacy-textclassifier":
        model = SpacyClassifier()
    elif approach == "bert":
        model = BertClassifier()
    elif approach == "scibert":
        model = BertClassifier(pretrained="scibert")
    elif approach == "classifierchain-tfidf-svm":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", ClassifierChain(classifier=_linear_svc())),
        ])
    elif approach == "labelpowerset-tfidf-svm":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", LabelPowerset(_linear_svc())),
        ])
    elif approach == "binaryrelevance-tfidf-svm":
        # same as OneVsRestClassifier
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", BinaryRelevance(classifier=_linear_svc())),
        ])
    elif approach == "binaryrelevance-tfidf-knn":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            # BinaryRelevance needs an estimator instance; the bare class
            # cannot be fitted.
            ("knn", BinaryRelevance(classifier=KNeighborsClassifier())),
        ])
    elif approach == "hashing_vectorizer-svm":
        model = Pipeline([
            ("hashing_vectorizer", HashingVectorizer()),
            ("svm",
             OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))),
        ])
    elif approach == "hashing_vectorizer-nb":
        model = Pipeline([
            (
                "hashing_vectorizer",
                HashingVectorizer(binary=True, n_features=2**18),
            ),
            ("nb", OneVsRestClassifier(MultinomialNB())),
        ])
    elif approach == "tfidf-sgd":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(stop_words="english",
                                max_df=0.95,
                                min_df=5,
                                ngram_range=(1, 1)),
            ),
            ("svm",
             OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))),
        ])
    elif approach == "cnn":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000)),
            (
                "cnn",
                CNNClassifier(
                    learning_rate=0.01,
                    dropout=0.1,
                    nb_epochs=20,
                    nb_layers=4,
                    multilabel=True,
                ),
            ),
        ])
    elif approach == "bilstm":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000, sequence_length=678)),
            (
                "bilstm",
                BiLSTMClassifier(learning_rate=0.01,
                                 dropout=0.1,
                                 nb_epochs=20,
                                 multilabel=True),
            ),
        ])
    elif approach == "doc2vec-sgd":
        model = Pipeline([
            ("vec", Doc2VecVectorizer()),
            (
                "sgd",
                OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=1e-8),
                                    n_jobs=-1),
            ),
        ])
    elif approach == "doc2vec-tfidf-sgd":
        model = _embedding_tfidf_sgd("doc2vec", Doc2VecVectorizer(),
                                     alpha=1e-6)
    elif approach == "sent2vec-sgd":
        model = Pipeline([
            ("vec", Sent2VecVectorizer(pretrained="biosent2vec")),
            (
                "sgd",
                OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=1e-8),
                                    n_jobs=-1),
            ),
        ])
    elif approach == "sent2vec-tfidf-sgd":
        model = _embedding_tfidf_sgd(
            "sent2vec",
            Sent2VecVectorizer(pretrained="biosent2vec"),
            alpha=1e-8,
        )
    elif approach == "tfidf-adaboost":
        model = Pipeline([
            ("tfidf", _bigram_tfidf()),
            (
                "adaboost",
                OneVsRestClassifier(
                    AdaBoostClassifier(DecisionTreeClassifier())),
            ),
        ])
    elif approach == "tfidf-gboost":
        model = Pipeline([
            ("tfidf", _bigram_tfidf()),
            ("gboost", OneVsRestClassifier(GradientBoostingClassifier())),
        ])
    elif approach == "tfidf+onehot_team-svm":
        model = _tfidf_onehot_svm("Team")
    elif approach == "tfidf+onehot_scheme-svm":
        model = _tfidf_onehot_svm("Scheme")
    elif approach == "mesh-tfidf-svm":
        model = MeshTfidfSVM()
    elif approach == "mesh-cnn":
        model = MeshCNN()
    elif approach == "science-ensemble":
        model = ScienceEnsemble()
    elif approach == "mesh-xlinear":
        model = MeshXLinear()
    else:
        raise ApproachNotImplemented

    if parameters:
        # literal_eval only evaluates Python literals, so a parameter string
        # cannot execute arbitrary code. A dict is used as-is.
        if isinstance(parameters, dict):
            params = parameters
        else:
            params = ast.literal_eval(parameters)
        model.set_params(**params)
    return model
Beispiel #7
0
def test_save_load_attention():
    """Save an attention BiLSTM, load it into a fresh classifier and score it.

    The loaded classifier must expose a ``model`` attribute and score above
    0.6 on the vectorized data the original was fitted on.
    """
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    features = vectorizer.fit_transform(texts)

    trained = BiLSTMClassifier(attention=True)
    trained.fit(features, labels)

    # The temporary directory is removed when the block exits, so all checks
    # on the restored model happen inside it.
    with tempfile.TemporaryDirectory() as save_dir:
        trained.save(save_dir)

        restored = BiLSTMClassifier()
        restored.load(save_dir)

        assert hasattr(restored, 'model')
        restored_score = restored.score(features, labels)
        assert restored_score > 0.6
Beispiel #8
0
from wellcomeml.ml.bilstm import BiLSTMClassifier
from wellcomeml.ml.keras_vectorizer import KerasVectorizer
from sklearn.pipeline import Pipeline

import numpy as np

# Binary example: each text carries a single 0/1 label.
X = ["One", "three", "one", "two", "four"]
Y = np.array([1, 0, 1, 0, 0])

bilstm_pipeline = Pipeline(steps=[
    ("vec", KerasVectorizer()),
    ("clf", BiLSTMClassifier()),
])
bilstm_pipeline.fit(X, Y)
print(bilstm_pipeline.score(X, Y))

# Multilabel example: each row of Y is a 0/1 indicator vector with three
# entries, and the classifier is switched to multilabel mode.
X = ["One, three", "one", "two, three"]
Y = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 1, 1],
])

bilstm_pipeline = Pipeline(steps=[
    ("vec", KerasVectorizer()),
    ("clf", BiLSTMClassifier(multilabel=True)),
])
bilstm_pipeline.fit(X, Y)
print(bilstm_pipeline.score(X, Y))