Example #1
def test_language_wordpiece_to_from_bytes(name):
    nlp = TransformersLanguage()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wordpiecer = TransformersWordPiecer.from_pretrained(nlp.vocab,
                                                        trf_name=name)
    nlp.add_pipe(wordpiecer)
    doc = nlp("hello world")
    assert doc._.get(ATTRS.word_pieces) is not None
    nlp2 = TransformersLanguage()
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(TransformersWordPiecer(nlp2.vocab))
    with pytest.raises(ValueError):
        new_doc = nlp2("hello world")
    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert new_doc._.get(ATTRS.word_pieces) is not None
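This test leans on scaffolding the listing omits; a minimal sketch, assuming the spacy-transformers v0.x layout (the fixture value and the ATTRS import path are assumptions):

# Hypothetical scaffolding for Example #1 (not part of the original snippet).
import pytest
from spacy_transformers import TransformersLanguage, TransformersWordPiecer
from spacy_transformers.util import ATTRS  # assumed import path (v0.x)

@pytest.fixture(params=["bert-base-uncased"])  # assumed test model
def name(request):
    return request.param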
Example #2
    def _init_nlp(self):
        if self.pretrained == "bert":
            self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
        elif self.pretrained == "scibert":
            name = "scibert-scivocab-uncased"
            path = "models/scibert_scivocab_uncased"

            nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
            nlp.add_pipe(nlp.create_pipe("sentencizer"))
            nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, path))
            nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, path))
            self.nlp = nlp
        else:
            logger.error(f"{self.pretrained} is not among bert, scibert")
            raise ValueError(f"Unknown pretrained model: {self.pretrained}")
        # TODO: Add a parameter for exclusive classes, non multilabel scenario
        self.textcat = self.nlp.create_pipe(
            "trf_textcat",
            config={"exclusive_classes": False, "architecture": "sigmoid_last_hidden"},
        )

        self.nlp.add_pipe(self.textcat, last=True)

        for label in self.unique_labels:
            self.textcat.add_label(label)
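A hedged usage sketch for the class above (clf, the minibatch iterator, and the label dicts are assumptions): once _init_nlp() has run, the trf_textcat head can be trained with spaCy v2's update loop, resuming from the pretrained transformer weights.

# Hypothetical training loop, spaCy v2 style (names are assumptions).
optimizer = clf.nlp.resume_training()
for texts, cats in minibatches:  # cats: list of {label: bool} dicts
    losses = {}
    clf.nlp.update(texts, [{"cats": c} for c in cats],
                   sgd=optimizer, losses=losses)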
Example #3
def nlp(name):
    p_nlp = TransformersLanguage(trf_name=name)
    p_nlp.add_pipe(p_nlp.create_pipe("sentencizer"))
    p_nlp.add_pipe(
        TransformersWordPiecer.from_pretrained(p_nlp.vocab, trf_name=name))
    p_nlp.add_pipe(
        TransformersTok2Vec.from_pretrained(p_nlp.vocab, trf_name=name))
    return p_nlp
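In the source suite this helper is presumably registered as a pytest fixture and consumed by the tests in Examples #5 and #6; a minimal sketch of the assumed wiring:

# Hypothetical scaffolding (the decorator and the model name are assumptions);
# the nlp function above would carry @pytest.fixture as well.
import pytest

@pytest.fixture(scope="module")
def name():
    return "bert-base-uncased"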
Example #4
def main(
    name="bert-base-uncased",
    n_texts=1000,
    lang="en",
    skip=False,
    retry=False,
    force=False,
):
    """Test the wordpiecer on a large dataset to find misalignments. If both the
    retry and force flag are set (which is the default runtime configuration),
    this script should always pass.

    * retry: If alignment fails after cleaning and normalizing both sets of
        tokens, try again with a more aggressive strategy that strips out all
        characters that are not uppercase/lowercase letters.
    * force: If alignment still fails, run the word-piece tokenizer on the
        individual spaCy tokens, so that alignment is trivial. This should
        always work.
    """
    cfg = {"retry_alignment": retry, "force_alignment": force}
    nlp = get_lang_class(lang)()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wp = TransformersWordPiecer.from_pretrained(nlp.vocab,
                                                trf_name=name,
                                                **cfg)
    msg.good(f"Loaded WordPiecer for model '{name}'")
    with msg.loading("Loading IMDB data..."):
        data, _ = thinc.extra.datasets.imdb(limit=n_texts)
    texts, _ = zip(*data)
    msg.good(f"Using {len(texts)} texts from IMDB data")
    msg.info("Processing texts...")
    sent_counts = 0
    for doc in tqdm.tqdm(nlp.pipe(texts), total=len(texts)):
        try:
            doc = wp(doc)
            sent_counts += len(list(doc.sents))
        except AssertionError as e:
            if len(e.args) and isinstance(e.args[0], tuple):  # misalignment error
                a, b = e.args[0]
                msg.fail("Misaligned tokens")
                print(diff_strings(a, b))
                if not skip:
                    sys.exit(1)
            elif len(e.args):
                msg.fail(f"Error: {e.args[0]}", exits=None if skip else 1)
            else:
                if skip:
                    print(e)
                else:
                    raise
    msg.good(f"Processed {len(texts)} documents ({sent_counts} sentences)")
Example #5
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    doc = nlp("hello world")
    assert is_valid_tensor(doc.tensor)
    nlp2 = TransformersLanguage()
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(TransformersWordPiecer(nlp2.vocab))
    nlp2.add_pipe(TransformersTok2Vec(nlp2.vocab))
    with pytest.raises(ValueError):
        new_doc = nlp2("hello world")
    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert is_valid_tensor(new_doc.tensor)
    assert new_doc._.get(ATTRS.word_pieces) is not None
Example #6
def test_language_to_from_disk(nlp, name):
    doc = nlp("hello world")
    assert is_valid_tensor(doc.tensor)
    with make_tempdir() as tempdir:
        nlp.to_disk(tempdir)
        new_nlp = TransformersLanguage()
        new_nlp.add_pipe(new_nlp.create_pipe("sentencizer"))
        wordpiecer = TransformersWordPiecer(new_nlp.vocab, trf_name=name)
        tok2vec = TransformersTok2Vec(new_nlp.vocab, trf_name=name)
        new_nlp.add_pipe(wordpiecer)
        new_nlp.add_pipe(tok2vec)
        new_nlp.from_disk(tempdir)
    assert new_nlp.pipe_names == nlp.pipe_names
    new_doc = new_nlp("hello world")
    assert is_valid_tensor(new_doc.tensor)
    assert_equal(doc.tensor, new_doc.tensor)
Example #7
def main(path, name="bert-base-uncased", lang="en"):
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading("Setting up the pipeline..."):
        nlp = TransformersLanguage(trf_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
    with msg.loading("Verifying model loads..."):
        nlp.from_disk(path)
    msg.good("Model loads!")
Example #8
def wp(name):
    return TransformersWordPiecer.from_pretrained(Vocab(), trf_name=name)
Example #9
import os

import spacy
from spacy_transformers import (
    TransformersWordPiecer,
    TransformersTok2Vec,
)

# Experiments showed SciBERT performs comparably to BM25, and BM25 is much cheaper.
USE_SCIBERT = False

# scispacy and scibert
spacy_nlp = spacy.load("en_core_sci_lg")

if USE_SCIBERT:
    path = os.path.join(os.path.dirname(__file__), "../..", "resources",
                        "scibert_scivocab_uncased")

    spacy_nlp.add_pipe(
        TransformersWordPiecer.from_pretrained(spacy_nlp.vocab, path))
    spacy_nlp.add_pipe(
        TransformersTok2Vec.from_pretrained(spacy_nlp.vocab, path))


def parse(doc):
    if isinstance(doc, str):
        doc = spacy_nlp(doc)
    return doc


def remove_stop_words_from_nl(doc):
    doc = parse(doc)
    return [t for t in doc if not t.is_stop]
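A short hedged usage sketch of the helpers above (the input sentence is illustrative):

# Hypothetical usage: parse a string and drop stop words.
tokens = remove_stop_words_from_nl("Amyloid beta is implicated in Alzheimer's disease.")
print([t.text for t in tokens])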

Example #10
def wordpiecer(name):
    return TransformersWordPiecer.from_pretrained(Vocab(), trf_name=name)
Example #11
import logging

import numpy as np
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing import sequence

import bert
from bert import BertModelLayer
from spacy_transformers import (
    TransformersLanguage,
    TransformersWordPiecer,
    TransformersTok2Vec,
)
from load_transform import extract_from_mongodb, extract_data_from_json

# spacy-transformers pipeline for preprocessing
name = "bert-base-uncased"
nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))


def load_bert_model():
    """
    load the saved trained bert model
    """
    logging.critical("Loading BERT model...")
    with open("model.json", "r") as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json, custom_objects={"BertModelLayer": bert.BertModelLayer})
    # load weights into new model
    loaded_model.load_weights("model.h5")
    logging.critical("Model is ready.")
    return loaded_model
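A hedged end-to-end sketch (whether the Keras model consumes doc.tensor directly, and its expected shape, are assumptions):

# Hypothetical inference sketch: doc.tensor is produced by TransformersTok2Vec.
model = load_bert_model()
doc = nlp("The movie was surprisingly good.")
features = doc.tensor[np.newaxis, ...]  # assumed shape (1, n_tokens, hidden_size)
prediction = model.predict(features)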