def _init_nlp(self):
    """Build the transformer pipeline and attach a multilabel text classifier.

    The base model is selected by ``self.pretrained``:

    * ``"bert"`` loads the packaged ``en_trf_bertbaseuncased_lg`` model.
    * ``"scibert"`` assembles a pipeline (sentencizer, word piecer,
      tok2vec) from ``models/scibert_scivocab_uncased``.

    The pipeline is stored on ``self.nlp``. A ``trf_textcat`` component is
    created, stored on ``self.textcat``, appended as the last pipe, and
    given one label per entry of ``self.unique_labels``.

    Raises:
        ValueError: if ``self.pretrained`` is neither ``"bert"`` nor
            ``"scibert"``.
    """
    if self.pretrained == "bert":
        self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
    elif self.pretrained == "scibert":
        name = "scibert-scivocab-uncased"
        path = "models/scibert_scivocab_uncased"

        nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, path))
        nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, path))
        self.nlp = nlp
    else:
        # A bare ``raise`` has no active exception to re-raise here and would
        # surface as an unrelated RuntimeError, so raise an explicit error
        # that names the offending value.
        message = f"{self.pretrained} is not among bert, scibert"
        logger.error(message)
        raise ValueError(message)

    # TODO: Add a parameter for exclusive classes, non multilabel scenario
    self.textcat = self.nlp.create_pipe(
        "trf_textcat",
        config={"exclusive_classes": False, "architecture": "sigmoid_last_hidden"},
    )
    self.nlp.add_pipe(self.textcat, last=True)

    for label in self.unique_labels:
        self.textcat.add_label(label)
Example #2
0
def nlp(name):
    """Assemble a transformers pipeline for the pretrained model ``name``.

    The returned pipeline contains, in order, a sentencizer, a word piecer
    and a tok2vec component; the latter two are created with
    ``from_pretrained``.
    """
    pipeline = TransformersLanguage(trf_name=name)

    sentencizer = pipeline.create_pipe("sentencizer")
    pipeline.add_pipe(sentencizer)

    wordpiecer = TransformersWordPiecer.from_pretrained(
        pipeline.vocab, trf_name=name
    )
    pipeline.add_pipe(wordpiecer)

    tok2vec = TransformersTok2Vec.from_pretrained(pipeline.vocab, name=name)
    pipeline.add_pipe(tok2vec)

    return pipeline
def test_tok2vec_to_from_disk(tok2vec, docs):
    """Round-trip a tok2vec component through disk and compare tensors."""
    reference = tok2vec(docs[0])
    assert is_valid_tensor(reference.tensor)

    with make_tempdir() as tmp_root:
        target = tmp_root / "tok2vec"
        tok2vec.to_disk(target)
        # Rebuild from a fresh vocab, then restore the saved state.
        restored = TransformersTok2Vec(Vocab())
        restored.from_disk(target)

    # The restored component is exercised after the temp dir is gone.
    result = restored(docs[0])
    assert is_valid_tensor(result.tensor)
    assert_equal(reference.tensor, result.tensor)
def test_tok2vec_to_from_bytes(tok2vec, docs):
    """Round-trip a tok2vec component through bytes and compare tensors."""
    reference = tok2vec(docs[0])
    assert is_valid_tensor(reference.tensor)

    payload = tok2vec.to_bytes()
    clone = TransformersTok2Vec(Vocab(), **tok2vec.cfg)

    # Before ``from_bytes`` is called the clone is expected to raise.
    with pytest.raises(ValueError):
        clone(docs[0])

    clone.from_bytes(payload)
    result = clone(docs[0])
    assert is_valid_tensor(result.tensor)
    assert_equal(reference.tensor, result.tensor)
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    """Restore a whole pipeline from bytes into a freshly built one."""
    reference = nlp("hello world")
    assert is_valid_tensor(reference.tensor)

    # Build an unloaded pipeline with the same component layout.
    fresh = TransformersLanguage()
    sentencizer = fresh.create_pipe("sentencizer")
    fresh.add_pipe(sentencizer)
    fresh.add_pipe(TransformersWordPiecer(nlp.vocab))
    fresh.add_pipe(TransformersTok2Vec(nlp.vocab))

    # Before ``from_bytes`` is called the pipeline is expected to raise.
    with pytest.raises(ValueError):
        fresh("hello world")

    payload = nlp.to_bytes()
    fresh.from_bytes(payload)

    result = fresh("hello world")
    assert is_valid_tensor(result.tensor)
    assert result._.get(ATTRS.word_pieces) is not None
def test_language_to_from_disk(nlp, name):
    """Save a pipeline to disk, reload it into a new one, compare outputs."""
    reference = nlp("hello world")
    assert is_valid_tensor(reference.tensor)

    with make_tempdir() as tmp_root:
        nlp.to_disk(tmp_root)

        # Recreate the same component layout before loading saved state.
        reloaded = TransformersLanguage()
        reloaded.add_pipe(reloaded.create_pipe("sentencizer"))
        new_wordpiecer = TransformersWordPiecer(reloaded.vocab, trf_name=name)
        new_tok2vec = TransformersTok2Vec(reloaded.vocab, trf_name=name)
        reloaded.add_pipe(new_wordpiecer)
        reloaded.add_pipe(new_tok2vec)
        reloaded.from_disk(tmp_root)

    assert reloaded.pipe_names == nlp.pipe_names

    result = reloaded("hello world")
    assert is_valid_tensor(result.tensor)
    assert_equal(reference.tensor, result.tensor)
def main(path, name="bert-base-uncased", lang="en"):
    """Create a transformers pipeline, save it to ``path`` and reload it.

    Args:
        path: location the pipeline is written to and re-read from.
        name: pretrained transformer name passed to the components.
        lang: language code stored in the pipeline meta.
    """
    printer = Printer()
    printer.info(f"Creating model for '{name}' ({lang})")

    with printer.loading("Setting up the pipeline..."):
        pipeline = TransformersLanguage(trf_name=name, meta={"lang": lang})
        sentencizer = pipeline.create_pipe("sentencizer")
        pipeline.add_pipe(sentencizer)
        wordpiecer = TransformersWordPiecer.from_pretrained(pipeline.vocab, name)
        pipeline.add_pipe(wordpiecer)
        tok2vec = TransformersTok2Vec.from_pretrained(pipeline.vocab, name)
        pipeline.add_pipe(tok2vec)
    printer.good("Initialized the model pipeline")

    pipeline.to_disk(path)
    printer.good(f"Saved '{name}' ({lang})")
    printer.text(f"Pipeline: {pipeline.pipe_names}")
    printer.text(f"Location: {path}")

    # Read the saved pipeline back in as a sanity check.
    with printer.loading("Verifying model loads..."):
        pipeline.from_disk(path)
    printer.good("Model loads!")
Example #8
0
)

# Feature flag for adding the SciBERT components to the pipeline below.
# Experiments showed results comparable to BM25, and BM25 is much cheaper,
# so the flag is off.
USE_SCIBERT = False

# Shared scispacy pipeline, loaded once at import time and used by the
# helper functions in this module; optionally extended with SciBERT.
spacy_nlp = spacy.load("en_core_sci_lg")

if USE_SCIBERT:
    # SciBERT weights are read from a "resources" directory located two
    # levels above this file.
    path = os.path.join(os.path.dirname(__file__), "../..", "resources",
                        "scibert_scivocab_uncased")

    # Append the word piecer first, then the tok2vec component.
    spacy_nlp.add_pipe(
        TransformersWordPiecer.from_pretrained(spacy_nlp.vocab, path))
    spacy_nlp.add_pipe(
        TransformersTok2Vec.from_pretrained(spacy_nlp.vocab, path))


def parse(doc):
    """Return ``doc`` run through ``spacy_nlp`` when it is a string.

    Any non-string argument is handed back untouched.
    """
    return spacy_nlp(doc) if isinstance(doc, str) else doc


def remove_stop_words_from_nl(doc):
    """Return the tokens of ``doc`` whose ``is_stop`` flag is falsy.

    ``doc`` is first passed through ``parse``.
    """
    kept = []
    for token in parse(doc):
        if token.is_stop:
            continue
        kept.append(token)
    return kept


def vectorize_tokens(tokens):
    # TODO: agh, hack, but want same vector
Example #9
0
import numpy as np
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing import sequence
from bert import BertModelLayer
import bert
from spacy_transformers import TransformersLanguage, TransformersWordPiecer, TransformersTok2Vec
from load_transform import extract_from_mongodb, extract_data_from_json
import logging

# spacy-transformers pipeline for preprocessing.
# Built once at import time; the components are added in the order
# sentencizer -> word piecer -> tok2vec.
name = "bert-base-uncased"  # pretrained transformer name used by every component below
nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))


def load_bert_model(model_json_path="model.json", weights_path="model.h5"):
    """Load the saved, trained BERT model.

    Args:
        model_json_path: path of the JSON file holding the serialized model
            architecture. Defaults to ``"model.json"``.
        weights_path: path of the weights file loaded into the rebuilt
            model. Defaults to ``"model.h5"``.

    Returns:
        The model rebuilt by ``model_from_json`` with its weights loaded.
    """
    # NOTE(review): CRITICAL is kept for these progress messages so they stay
    # visible under the default logging configuration — confirm intent.
    logging.critical("Loading BERT model...")

    # The context manager closes the file even if reading fails.
    with open(model_json_path, "r") as json_file:
        loaded_model_json = json_file.read()

    # The custom layer must be registered so the architecture can be rebuilt.
    loaded_model = model_from_json(
        loaded_model_json,
        custom_objects={"BertModelLayer": bert.BertModelLayer},
    )
    # load weights into new model
    loaded_model.load_weights(weights_path)
    logging.critical("Model is ready.")
    return loaded_model