def _init_nlp(self):
    if self.pretrained == "bert":
        # Pre-packaged BERT-base pipeline shipped with spacy-transformers.
        self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
    elif self.pretrained == "scibert":
        # Build the pipeline manually from a local SciBERT checkpoint.
        name = "scibert-scivocab-uncased"
        path = "models/scibert_scivocab_uncased"
        nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, path))
        nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, path))
        self.nlp = nlp
    else:
        logger.error(f"{self.pretrained} is not among 'bert', 'scibert'")
        raise ValueError(f"Unsupported pretrained model: {self.pretrained}")

    # TODO: Add a parameter for exclusive classes, non-multilabel scenario
    self.textcat = self.nlp.create_pipe(
        "trf_textcat",
        config={"exclusive_classes": False, "architecture": "sigmoid_last_hidden"},
    )
    self.nlp.add_pipe(self.textcat, last=True)
    for label in self.unique_labels:
        self.textcat.add_label(label)
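# A minimal training sketch for the `trf_textcat` pipe created above, using
# the spaCy 2.x update API that spacy-transformers targets. The method name,
# the `train_data` format ((text, {"cats": {...}}) tuples), and the iteration
# count are assumptions for illustration, not part of the original class.
def train_textcat(self, train_data, n_iter=4):
    import random

    optimizer = self.nlp.resume_training()
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            # Each update fine-tunes the transformer and the textcat head.
            self.nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        logger.info(f"textcat loss: {losses}")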
@pytest.fixture
def nlp(name):
    # Full pipeline fixture: sentencizer + word piecer + tok2vec for `name`.
    p_nlp = TransformersLanguage(trf_name=name)
    p_nlp.add_pipe(p_nlp.create_pipe("sentencizer"))
    p_nlp.add_pipe(TransformersWordPiecer.from_pretrained(p_nlp.vocab, trf_name=name))
    p_nlp.add_pipe(TransformersTok2Vec.from_pretrained(p_nlp.vocab, name=name))
    return p_nlp
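# The tests below also rely on `name`, `tok2vec`, and `docs` fixtures plus
# `is_valid_tensor`, `make_tempdir`, and `assert_equal` helpers that are not
# shown in this excerpt. A plausible conftest-style sketch follows; every name
# and default below is an assumption, not the project's actual code.
import contextlib
import shutil
import tempfile
from pathlib import Path

import numpy
import pytest
from numpy.testing import assert_equal  # noqa: F401  (used by the tests below)


@pytest.fixture
def name():
    return "bert-base-uncased"


@pytest.fixture
def tok2vec(nlp, name):
    return TransformersTok2Vec.from_pretrained(nlp.vocab, name)


@pytest.fixture
def docs(nlp):
    # Run the full pipeline so the word-piece attributes are set on each doc.
    return [nlp(text) for text in ("hello world", "this is a test")]


def is_valid_tensor(tensor):
    # A usable tensor is a non-empty array that contains no NaNs.
    return tensor is not None and tensor.size > 0 and not numpy.isnan(tensor).any()


@contextlib.contextmanager
def make_tempdir():
    # Temporary directory that is cleaned up when the block exits.
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        shutil.rmtree(str(d))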
def test_tok2vec_to_from_disk(tok2vec, docs):
    doc = tok2vec(docs[0])
    assert is_valid_tensor(doc.tensor)
    with make_tempdir() as tempdir:
        file_path = tempdir / "tok2vec"
        tok2vec.to_disk(file_path)
        new_tok2vec = TransformersTok2Vec(Vocab())
        new_tok2vec.from_disk(file_path)
    new_doc = new_tok2vec(docs[0])
    assert is_valid_tensor(new_doc.tensor)
    assert_equal(doc.tensor, new_doc.tensor)
def test_tok2vec_to_from_bytes(tok2vec, docs):
    doc = tok2vec(docs[0])
    assert is_valid_tensor(doc.tensor)
    bytes_data = tok2vec.to_bytes()
    new_tok2vec = TransformersTok2Vec(Vocab(), **tok2vec.cfg)
    # Before from_bytes, the component has no model and must refuse to run.
    with pytest.raises(ValueError):
        new_doc = new_tok2vec(docs[0])
    new_tok2vec.from_bytes(bytes_data)
    new_doc = new_tok2vec(docs[0])
    assert is_valid_tensor(new_doc.tensor)
    assert_equal(doc.tensor, new_doc.tensor)
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    doc = nlp("hello world")
    assert is_valid_tensor(doc.tensor)
    nlp2 = TransformersLanguage()
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(TransformersWordPiecer(nlp.vocab))
    nlp2.add_pipe(TransformersTok2Vec(nlp.vocab))
    # Before from_bytes, the uninitialized pipeline must refuse to run.
    with pytest.raises(ValueError):
        new_doc = nlp2("hello world")
    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert is_valid_tensor(new_doc.tensor)
    assert new_doc._.get(ATTRS.word_pieces) is not None
def test_language_to_from_disk(nlp, name):
    doc = nlp("hello world")
    assert is_valid_tensor(doc.tensor)
    with make_tempdir() as tempdir:
        nlp.to_disk(tempdir)
        new_nlp = TransformersLanguage()
        new_nlp.add_pipe(new_nlp.create_pipe("sentencizer"))
        wordpiecer = TransformersWordPiecer(new_nlp.vocab, trf_name=name)
        tok2vec = TransformersTok2Vec(new_nlp.vocab, trf_name=name)
        new_nlp.add_pipe(wordpiecer)
        new_nlp.add_pipe(tok2vec)
        new_nlp.from_disk(tempdir)
    assert new_nlp.pipe_names == nlp.pipe_names
    new_doc = new_nlp("hello world")
    assert is_valid_tensor(new_doc.tensor)
    assert_equal(doc.tensor, new_doc.tensor)
def main(path, name="bert-base-uncased", lang="en"):
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading("Setting up the pipeline..."):
        nlp = TransformersLanguage(trf_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
    with msg.loading("Verifying model loads..."):
        nlp.from_disk(path)
    msg.good("Model loads!")
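# Hypothetical entry point for the script above; the original invocation is
# not shown, so using plac (as the official spaCy example scripts do) is an
# assumption.
if __name__ == "__main__":
    import plac

    plac.call(main)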
)  # experiments showed this comparable to BM25, and BM25 is much cheaper

USE_SCIBERT = False

# scispacy and scibert
spacy_nlp = spacy.load("en_core_sci_lg")
if USE_SCIBERT:
    path = os.path.join(
        os.path.dirname(__file__), "../..", "resources", "scibert_scivocab_uncased"
    )
    spacy_nlp.add_pipe(TransformersWordPiecer.from_pretrained(spacy_nlp.vocab, path))
    spacy_nlp.add_pipe(TransformersTok2Vec.from_pretrained(spacy_nlp.vocab, path))


def parse(doc):
    # Accept either a raw string or an already-parsed Doc.
    if isinstance(doc, str):
        doc = spacy_nlp(doc)
    return doc


def remove_stop_words_from_nl(doc):
    doc = parse(doc)
    return [t for t in doc if not t.is_stop]


def vectorize_tokens(tokens):
    # TODO: agh, hack, but want same vector
import logging

import numpy as np
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing import sequence
from bert import BertModelLayer
from spacy_transformers import (
    TransformersLanguage,
    TransformersWordPiecer,
    TransformersTok2Vec,
)

from load_transform import extract_from_mongodb, extract_data_from_json

# spacy-transformers pipeline for preprocessing
name = "bert-base-uncased"
nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))


def load_bert_model():
    """Load the saved, trained BERT model."""
    logging.critical("Loading BERT model...")
    with open("model.json", "r") as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(
        loaded_model_json, custom_objects={"BertModelLayer": BertModelLayer}
    )
    # load weights into new model
    loaded_model.load_weights("model.h5")
    logging.critical("Model is ready.")
    return loaded_model
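# A hedged sketch of how the pipeline and the loaded Keras model might be
# combined at prediction time. `doc._.trf_word_pieces` is the word-piece id
# list set by the spacy-transformers pipes above; MAX_LEN and the helper
# itself are assumptions, since the original training code is not shown.
MAX_LEN = 128  # assumed padding length; must match what the model was trained with


def predict(texts, model):
    docs = [nlp(t) for t in texts]
    # Word-piece ids come from the spacy-transformers extension attribute.
    token_ids = [doc._.trf_word_pieces for doc in docs]
    padded = sequence.pad_sequences(token_ids, maxlen=MAX_LEN)
    return model.predict(padded)


# Example usage (hypothetical):
# model = load_bert_model()
# scores = predict(["An example abstract about proteins."], model)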