Beispiel #1
0
def test_build_model():
    """Fit a multilabel CNN, then rebuild it for three outputs and fit again.

    The first fit uses four label columns; afterwards ``build_model`` is
    called explicitly with dimensions derived from the vectorized data and
    the truncated label matrix, and the prediction width is checked again.
    """
    texts = ["One and two", "One only", "Two nothing else", "Two and three"]
    labels = np.array([[1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]])

    X_vec = KerasVectorizer().fit_transform(texts)

    batch_size = 2
    model = CNNClassifier(
        batch_size=batch_size,
        multilabel=True,
        learning_rate=1e-2,
    )
    model.fit(X_vec, labels)
    assert model.predict(X_vec).shape[1] == 4

    # Keep only the first three label columns and rebuild the network for them
    labels = labels[:, :3]
    model.build_model(
        X_vec.shape[1],               # sequence_length
        X_vec.max() + 1,              # vocab_size
        labels.shape[1],              # nb_outputs
        X_vec.shape[0] / batch_size,  # decay_steps
    )
    model.fit(X_vec, labels)
    assert model.predict(X_vec).shape[1] == 3
 def _init_classifier(self):
     """Create the CNN classifier and store it on ``self.classifier``.

     ``threshold`` and ``batch_size`` are taken from the instance; the
     remaining hyper-parameters are fixed here.
     """
     cnn_settings = dict(
         learning_rate=0.01,
         dropout=0.1,
         sparse_y=True,
         nb_epochs=20,
         nb_layers=4,
         multilabel=True,
         threshold=self.threshold,
         batch_size=self.batch_size,
     )
     self.classifier = CNNClassifier(**cnn_settings)
Beispiel #3
0
def test_XY_dataset_sparse_y():
    """Train from a ``tf.data`` dataset and score against sparse targets."""
    texts = ["One and two", "One only", "Two nothing else", "Two and three"]
    Y = np.array([[1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]])
    Y_sparse = csr_matrix(Y)

    X_vec = KerasVectorizer().fit_transform(texts)

    # Training examples pair inputs with the dense targets; the test dataset
    # carries inputs only and is scored against the sparse copy of Y.
    train_data = tf.data.Dataset.from_tensor_slices((X_vec, Y))
    test_data = tf.data.Dataset.from_tensor_slices(X_vec)

    clf = CNNClassifier(batch_size=2, sparse_y=True, multilabel=True)
    clf.fit(train_data)
    score = clf.score(test_data, Y_sparse)
    assert score > 0.3
Beispiel #4
0
def test_XY_dataset():
    """Fit and score a CNN classifier on a shuffled ``tf.data`` dataset."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    Y = np.array([0, 0, 1, 1])

    X_vec = KerasVectorizer().fit_transform(texts)

    # NOTE(review): the dataset is shuffled but scored against Y in its
    # original order -- confirm how CNNClassifier.score aligns the two.
    data = tf.data.Dataset.from_tensor_slices((X_vec, Y)).shuffle(100, seed=42)

    clf = CNNClassifier(batch_size=2)
    clf.fit(data)
    score = clf.score(data, Y)
    assert score > 0.3
Beispiel #5
0
def test_XY_list():
    """Fit a vectorizer + CNN pipeline when the targets are a plain list."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    targets = [0, 0, 1, 1]

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(batch_size=2)
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])

    pipeline.fit(texts, targets)
    score = pipeline.score(texts, targets)
    assert score > 0.6
Beispiel #6
0
def test_feature_approach_concat():
    """Fit a CNN pipeline configured with ``feature_approach="concat"``."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    targets = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(batch_size=2, feature_approach="concat")
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])

    pipeline.fit(texts, targets)
    score = pipeline.score(texts, targets)
    assert score > 0.6
Beispiel #7
0
def test_threshold():
    """Check that ``predict`` equals ``predict_proba`` cut at the threshold."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    targets = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(batch_size=2, threshold=0.1)
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])
    pipeline.fit(texts, targets)

    # Expected labels are the probabilities cut at the configured threshold
    expected = pipeline.predict_proba(texts) > 0.1
    predicted = pipeline.predict(texts)
    assert np.array_equal(expected, predicted)
Beispiel #8
0
def test_predict_proba():
    """Check that ``predict_proba`` returns one row per sample within [0, 1].

    The range checks use ``np.all`` rather than the builtin ``sum``: the
    builtin only reduces the first axis, so with a 2-D probability array the
    comparison against the sample count produces an array instead of a bool.
    """
    X = ["One", "One only", "Two nothing else", "Two and three"]
    Y = np.array([0, 0, 1, 1])

    model = Pipeline([('vec', KerasVectorizer()),
                      ('clf', CNNClassifier(batch_size=2))])
    model.fit(X, Y)
    Y_pred_prob = model.predict_proba(X)

    # One prediction (row) per input sample
    assert len(Y_pred_prob) == Y.shape[0]
    # Every value is a valid probability
    assert np.all(Y_pred_prob >= 0)
    assert np.all(Y_pred_prob <= 1)
Beispiel #9
0
def test_attention():
    """Fit a CNN pipeline with attention enabled and ten attention heads."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    targets = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(
        batch_size=2,
        attention=True,
        attention_heads=10,
    )
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])

    pipeline.fit(texts, targets)
    score = pipeline.score(texts, targets)
    assert score > 0.6
Beispiel #10
0
def test_multilabel():
    """Fit a multilabel CNN pipeline and check score and prediction shape."""
    texts = [
        "One and two",
        "One only",
        "Three and four, nothing else",
        "Two nothing else",
        "Two and three",
    ]
    targets = np.array([
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 1, 0],
    ])

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(batch_size=2, multilabel=True)
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])
    pipeline.fit(texts, targets)

    score = pipeline.score(texts, targets)
    assert score > 0.4
    # Five samples, four labels each
    predictions = pipeline.predict(texts)
    assert predictions.shape == (5, 4)
Beispiel #11
0
def test_multilabel_attention():
    """Fit a multilabel CNN pipeline using the multilabel-attention features."""
    texts = ["One and two", "One only", "Two nothing else", "Two and three"]
    targets = np.array([
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 1, 0],
    ])

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(
        batch_size=2,
        multilabel=True,
        attention=True,
        feature_approach="multilabel-attention",
        learning_rate=1e-2,
    )
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])

    pipeline.fit(texts, targets)
    score = pipeline.score(texts, targets)
    assert score > 0.3
Beispiel #12
0
def test_early_stopping():
    """Fit with early stopping enabled and a very large epoch budget."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    targets = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(
        batch_size=2,
        early_stopping=True,
        nb_epochs=10000,
    )
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])

    # With 10000 epochs requested, this test only finishes quickly when early
    # stopping actually kicks in; otherwise it runs for a long time and
    # exhausts the 4MB log limit in travis.
    pipeline.fit(texts, targets)
    score = pipeline.score(texts, targets)
    assert score > 0.6
Beispiel #13
0
def _tfidf_unigram(min_df=0.0):
    """Build a unigram TF-IDF vectorizer (English stop words, max_df=0.95)."""
    return TfidfVectorizer(
        stop_words="english",
        max_df=0.95,
        min_df=min_df,
        ngram_range=(1, 1),
    )


def _tfidf_bigram():
    """Build a uni+bigram TF-IDF vectorizer (English stop words, min_df=5)."""
    return TfidfVectorizer(
        min_df=5,
        stop_words="english",
        ngram_range=(1, 2),
    )


def _ovr_linear_svc():
    """Build a one-vs-rest linear SVC with probability estimates enabled."""
    return OneVsRestClassifier(SVC(kernel="linear", probability=True))


def _ovr_hinge_sgd():
    """Build a one-vs-rest SGD classifier with hinge loss and l2 penalty."""
    return OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))


def _ovr_sgd(alpha):
    """Build a one-vs-rest l2 SGD classifier with the given alpha, n_jobs=-1."""
    return OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=alpha),
                               n_jobs=-1)


def _normalized_union_with_tfidf(name, vectorizer):
    """Union a normalized embedding vectorizer with normalized bigram TF-IDF.

    name: prefix used for the embedding branch and its step names.
    vectorizer: embedding vectorizer instance placed in that branch.
    """
    return FeatureUnion([
        (
            name,
            Pipeline([
                (f"{name}_unscaled", vectorizer),
                (f"scale_{name}", Normalizer()),
            ]),
        ),
        (
            "tfidf",
            Pipeline([
                ("tfidf_unscaled", _tfidf_bigram()),
                ("scale_tfidf", Normalizer()),
            ]),
        ),
    ])


def _tfidf_onehot_svm(column):
    """Build a TF-IDF("text") + one-hot(column) pipeline with a balanced SVM.

    column: name of the categorical column that gets one-hot encoded.
    """
    return Pipeline([
        (
            "vectorizer",
            FeatureUnion([
                (
                    "text_features",
                    Pipeline([
                        ("selector", FunctionTransformer(lambda x: x["text"])),
                        ("tfidf", _tfidf_bigram()),
                    ]),
                ),
                (
                    # The step is named "team_features" for every column so
                    # that existing parameter paths keep working.
                    "team_features",
                    Pipeline([
                        (
                            "selector",
                            FunctionTransformer(lambda x: x[[column]]),
                        ),
                        ("one hot", OneHotEncoder(handle_unknown="ignore")),
                    ]),
                ),
            ]),
        ),
        (
            "svm",
            OneVsRestClassifier(SVC(class_weight="balanced",
                                    kernel="linear")),
        ),
    ])


def create_model(approach, parameters=None):
    """Create the (unfitted) model that corresponds to ``approach``.

    approach: str, name of the approach; see the branches below for the
        supported values.
    parameters: optional. Either a string holding a Python literal dict
        (parsed with ``ast.literal_eval``) or a dict. When given and
        non-empty it is applied through ``model.set_params``.

    Returns the created model.

    Raises ApproachNotImplemented when ``approach`` is not recognised.
    """
    if approach == "tfidf-svm":
        model = Pipeline([
            ("tfidf", _tfidf_unigram()),
            ("svm", _ovr_linear_svc()),
        ])
    elif approach == "tfidf-transformers-svm":
        model = TfidfTransformersSVM()
    elif approach == "bert-svm":
        model = Pipeline([
            ("bert", BertVectorizer(pretrained="bert")),
            ("svm", _ovr_linear_svc()),
        ])
    elif approach == "scibert-svm":
        model = Pipeline([
            ("scibert", BertVectorizer(pretrained="scibert")),
            ("svm", _ovr_linear_svc()),
        ])
    elif approach == "spacy-textclassifier":
        model = SpacyClassifier()
    elif approach == "bert":
        model = BertClassifier()
    elif approach == "scibert":
        model = BertClassifier(pretrained="scibert")
    elif approach == "classifierchain-tfidf-svm":
        model = Pipeline([
            ("tfidf", _tfidf_unigram()),
            (
                "svm",
                ClassifierChain(
                    classifier=SVC(kernel="linear", probability=True)),
            ),
        ])
    elif approach == "labelpowerset-tfidf-svm":
        model = Pipeline([
            ("tfidf", _tfidf_unigram()),
            ("svm", LabelPowerset(SVC(kernel="linear", probability=True))),
        ])
    elif approach == "binaryrelevance-tfidf-svm":
        # same as OneVsRestClassifier
        model = Pipeline([
            ("tfidf", _tfidf_unigram()),
            (
                "svm",
                BinaryRelevance(
                    classifier=SVC(kernel="linear", probability=True)),
            ),
        ])
    elif approach == "binaryrelevance-tfidf-knn":
        model = Pipeline([
            ("tfidf", _tfidf_unigram()),
            # An estimator instance is passed, not the class itself
            ("knn", BinaryRelevance(classifier=KNeighborsClassifier())),
        ])
    elif approach == "hashing_vectorizer-svm":
        model = Pipeline([
            ("hashing_vectorizer", HashingVectorizer()),
            ("svm", _ovr_hinge_sgd()),
        ])
    elif approach == "hashing_vectorizer-nb":
        model = Pipeline([
            (
                "hashing_vectorizer",
                HashingVectorizer(binary=True, n_features=2**18),
            ),
            ("nb", OneVsRestClassifier(MultinomialNB())),
        ])
    elif approach == "tfidf-sgd":
        model = Pipeline([
            ("tfidf", _tfidf_unigram(min_df=5)),
            ("svm", _ovr_hinge_sgd()),
        ])
    elif approach == "cnn":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000)),
            (
                "cnn",
                CNNClassifier(
                    learning_rate=0.01,
                    dropout=0.1,
                    nb_epochs=20,
                    nb_layers=4,
                    multilabel=True,
                ),
            ),
        ])
    elif approach == "bilstm":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000, sequence_length=678)),
            (
                "bilstm",
                BiLSTMClassifier(learning_rate=0.01,
                                 dropout=0.1,
                                 nb_epochs=20,
                                 multilabel=True),
            ),
        ])
    elif approach == "doc2vec-sgd":
        model = Pipeline([
            ("vec", Doc2VecVectorizer()),
            ("sgd", _ovr_sgd(alpha=1e-8)),
        ])
    elif approach == "doc2vec-tfidf-sgd":
        model = Pipeline([
            (
                "vec",
                _normalized_union_with_tfidf("doc2vec", Doc2VecVectorizer()),
            ),
            ("sgd", _ovr_sgd(alpha=1e-6)),
        ])
    elif approach == "sent2vec-sgd":
        model = Pipeline([
            ("vec", Sent2VecVectorizer(pretrained="biosent2vec")),
            ("sgd", _ovr_sgd(alpha=1e-8)),
        ])
    elif approach == "sent2vec-tfidf-sgd":
        model = Pipeline([
            (
                "vec",
                _normalized_union_with_tfidf(
                    "sent2vec",
                    Sent2VecVectorizer(pretrained="biosent2vec"),
                ),
            ),
            ("sgd", _ovr_sgd(alpha=1e-8)),
        ])
    elif approach == "tfidf-adaboost":
        model = Pipeline([
            ("tfidf", _tfidf_bigram()),
            (
                "adaboost",
                OneVsRestClassifier(
                    AdaBoostClassifier(DecisionTreeClassifier())),
            ),
        ])
    elif approach == "tfidf-gboost":
        model = Pipeline([
            ("tfidf", _tfidf_bigram()),
            ("gboost", OneVsRestClassifier(GradientBoostingClassifier())),
        ])
    elif approach == "tfidf+onehot_team-svm":
        model = _tfidf_onehot_svm("Team")
    elif approach == "tfidf+onehot_scheme-svm":
        model = _tfidf_onehot_svm("Scheme")
    elif approach == "mesh-tfidf-svm":
        model = MeshTfidfSVM()
    elif approach == "mesh-cnn":
        model = MeshCNN()
    elif approach == "science-ensemble":
        model = ScienceEnsemble()
    elif approach == "mesh-xlinear":
        model = MeshXLinear()
    else:
        raise ApproachNotImplemented

    if parameters:
        # A dict is used as is; anything else is parsed as a Python literal
        if isinstance(parameters, dict):
            params = parameters
        else:
            params = ast.literal_eval(parameters)
        model.set_params(**params)
    return model
Beispiel #14
0
from wellcomeml.ml.cnn import CNNClassifier
from wellcomeml.ml.keras_vectorizer import KerasVectorizer
from sklearn.pipeline import Pipeline

import numpy as np

# Single-label example: one binary target per text
X = ["One", "three", "one", "two", "four"]
Y = np.array([1, 0, 1, 0, 0])

cnn_pipeline = Pipeline(steps=[
    ("vec", KerasVectorizer()),
    ("clf", CNNClassifier()),
])
cnn_pipeline.fit(X, Y)
print(cnn_pipeline.score(X, Y))

# Multilabel example: one row of three binary targets per text
X = ["One, three", "one", "two, three"]
Y = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 1, 1],
])

cnn_pipeline = Pipeline(steps=[
    ("vec", KerasVectorizer()),
    ("clf", CNNClassifier(multilabel=True)),
])
cnn_pipeline.fit(X, Y)
print(cnn_pipeline.score(X, Y))
Beispiel #15
0
def test_save_load_attention():
    """Save an attention CNN to a temp directory, reload it and score it."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    targets = np.array([0, 0, 1, 1])

    X_vec = KerasVectorizer().fit_transform(texts)

    trained = CNNClassifier(batch_size=2, attention=True)
    trained.fit(X_vec, targets)

    with tempfile.TemporaryDirectory() as tmp_dir:
        trained.save(tmp_dir)

        # Load into a freshly constructed classifier
        restored = CNNClassifier()
        restored.load(tmp_dir)

        assert hasattr(restored, 'model')
        restored_score = restored.score(X_vec, targets)
        assert restored_score > 0.6
Beispiel #16
0
class MeshCNN:
    """Multilabel tagger that pairs a ``KerasVectorizer`` with a ``CNNClassifier``.

    Data can be passed in memory (``list`` / ``np.ndarray``) or as callables
    that return a fresh generator on every call; in the latter case it is
    streamed to the classifier through a ``tf.data.Dataset``.
    """

    def __init__(
        self,
        threshold=0.5,
        batch_size=256,
        shuffle=True,
        buffer_size=1000,
        data_cache=None,
        random_seed=42,
    ):
        """
        threshold: float, default 0.5. Probability threshold on top of which a tag should be assigned.
        batch_size: int, default 256. Size of batches used for training and prediction.
        shuffle: bool, default True. Flag on whether to shuffle data before fit.
        buffer_size: int, default 1000. Buffer size used for shuffling or transforming data before fit.
        data_cache: path, default None. Path to use for caching data transformations.
        random_seed: int, default 42. Random seed that controls reproducibility.
        """
        self.threshold = threshold
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.buffer_size = buffer_size
        self.data_cache = data_cache
        self.random_seed = random_seed

    def _yield_data(self, X, vectorizer, Y=None):
        """
        Build a tf.data.Dataset that yields vectorized X (and Y) one by one

        X: callable that returns a generator of texts
        vectorizer: vectorizer that implements transform which transforms texts to integers
        Y: callable that returns a generator of targets (tags) assigned, each
           item a numpy array or a sparse row. Optional.

        If Y is missing, for example when called by predict, the dataset
        yields only X vectorized
        """
        has_targets = Y is not None

        def yield_transformed_data(X_buffer, Y_buffer):
            # TODO: This could move to WellcomeML to enable CNN to receive generators
            Y_den = None
            X_vec = vectorizer.transform(X_buffer)
            if Y_buffer:
                # Y_buffer is a list of numpy arrays or sparse arrays
                if isinstance(Y_buffer[0], np.ndarray):
                    Y_den = np.vstack(Y_buffer)
                else:  # sparse
                    Y_den = np.asarray(sp.vstack(Y_buffer).todense())

            for i in range(len(X_buffer)):
                if Y_den is not None:
                    yield X_vec[i], Y_den[i]
                else:
                    yield X_vec[i]

        def data_gen():
            """
            Wrapper on top of yield_transformed_data to get a callable function
            which enables to restart the iterator.

            This function also implements buffering for more efficient transformations
            """
            X_buffer = []
            Y_buffer = []

            # Calling X / Y again gives a fresh generator on every restart
            if has_targets:
                data_zip = zip(X(), Y())
            else:
                data_zip = X()

            for data_example in data_zip:
                if has_targets:
                    x, y = data_example
                    Y_buffer.append(y)
                else:
                    x = data_example
                X_buffer.append(x)

                # Transform and flush once a full buffer has been collected
                if len(X_buffer) >= self.buffer_size:
                    yield from yield_transformed_data(X_buffer, Y_buffer)

                    X_buffer = []
                    Y_buffer = []

            # Flush whatever is left over after the last full buffer
            if X_buffer:
                yield from yield_transformed_data(X_buffer, Y_buffer)

        output_types = (tf.int32, tf.int32) if has_targets else (tf.int32)
        data = tf.data.Dataset.from_generator(data_gen,
                                              output_types=output_types)

        if self.data_cache:
            data = data.cache(self.data_cache)
        return data

    def _init_vectorizer(self):
        """Create the vectorizer and store it on self.vectorizer."""
        self.vectorizer = KerasVectorizer(vocab_size=5_000,
                                          sequence_length=400)

    def _init_classifier(self):
        """Create the CNN classifier and store it on self.classifier."""
        self.classifier = CNNClassifier(
            learning_rate=0.01,
            dropout=0.1,
            sparse_y=True,
            nb_epochs=20,
            nb_layers=4,
            multilabel=True,
            threshold=self.threshold,
            batch_size=self.batch_size,
        )

    def set_params(self, **params):
        """
        Forward parameters to the vectorizer ("vec") and the classifier ("cnn"),
        creating either component first if it does not exist yet.

        Returns self.
        """
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()
        vec_params = get_params_for_component(params, "vec")
        clf_params = get_params_for_component(params, "cnn")
        self.vectorizer.set_params(**vec_params)
        self.classifier.set_params(**clf_params)
        return self

    def fit(self, X, Y):
        """
        X: list or generator of texts
        Y: 2d numpy array or sparse csr_matrix or generator of 2d numpy array of tags assigned

        If X is a generator it need to be callable i.e. return
        the generator by calling it X_gen = X(). This is so
        we can iterate on the data again. In that case Y is
        called in the same way.

        Returns self.
        """
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()

        if isinstance(X, (list, np.ndarray)):
            print("Fitting vectorizer")
            self.vectorizer.fit(X)
            X_vec = self.vectorizer.transform(X)
            print(X_vec.shape)
            print("Fitting classifier")
            self.classifier.fit(X_vec, Y)
        else:
            print("Fitting vectorizer")
            X_gen = X()
            self.vectorizer.fit(X_gen)
            print("Fitting classifier")
            # The classifier is handed the vectorizer's dimensions explicitly
            params_from_vectorizer = {
                "sequence_length": self.vectorizer.sequence_length,
                "vocab_size": self.vectorizer.vocab_size,
            }
            self.classifier.set_params(**params_from_vectorizer)
            train_data = self._yield_data(X, self.vectorizer, Y)
            # TODO: This should move inside CNNClassifier
            if self.shuffle:
                train_data = train_data.shuffle(self.buffer_size,
                                                seed=self.random_seed)
            self.classifier.fit(train_data)

        return self

    def predict(self, X):
        """
        X: list, numpy array or callable that returns a generator of texts

        Returns whatever the classifier's predict returns for the vectorized X.
        """
        if isinstance(X, (list, np.ndarray)):
            X_vec = self.vectorizer.transform(X)
            Y_pred = self.classifier.predict(X_vec)
        else:
            pred_data = self._yield_data(X, self.vectorizer)
            Y_pred = self.classifier.predict(pred_data)
        return Y_pred

    def predict_proba(self, X):
        """
        X: list, numpy array or callable that returns a generator of texts

        In-memory X is predicted in chunks of batch_size and the chunks are
        joined along the sample axis, so the result has one entry (row) per
        input text in the original order.
        """
        if isinstance(X, (list, np.ndarray)):
            X_vec = self.vectorizer.transform(X)
            Y_pred_proba = []
            for i in range(0, X_vec.shape[0], self.batch_size):
                Y_pred_proba_batch = self.classifier.predict_proba(
                    X_vec[i:i + self.batch_size])
                Y_pred_proba.append(Y_pred_proba_batch)
            # Batches are slices of samples, so they are stacked on axis 0
            Y_pred_proba = np.concatenate(Y_pred_proba, axis=0)
        else:
            pred_data = self._yield_data(X, self.vectorizer)
            Y_pred_proba = self.classifier.predict_proba(pred_data)
        return Y_pred_proba

    def save(self, model_path):
        """
        Write meta.json, the pickled vectorizer and the classifier to model_path.

        model_path: directory path, created (with parents) if missing.
        """
        os.makedirs(model_path, exist_ok=True)

        meta = {"name": "MeshCNN", "approach": "mesh-cnn"}
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "w") as f:
            f.write(json.dumps(meta))

        vectorizer_path = os.path.join(model_path, "vectorizer.pkl")
        save_pickle(vectorizer_path, self.vectorizer)
        self.classifier.save(model_path)

    def load(self, model_path):
        """
        Restore the vectorizer and the classifier from model_path.

        model_path: directory previously written by save.
        """
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "r") as f:
            meta = json.loads(f.read())
        # NOTE(review): save writes only "name" and "approach" into meta;
        # presumably get_params_for_component filters those out - confirm.
        self.set_params(**meta)

        vectorizer_path = os.path.join(model_path, "vectorizer.pkl")
        self.vectorizer = load_pickle(vectorizer_path)

        self._init_classifier()
        self.classifier.load(model_path)