def __init__(self,
                 model_name: str,
                 model_path: Optional[str] = None) -> None:
        """
        Set up the spaCy pipeline used for embedding and record its output size.

        Loads a model_name (e.g. en_pytt_xlnetbasecased_lg) or a combination
        of model name and local model path (e.g. xlnet-large-cased and /local/mlinde/xlnet-large-cased)
        see https://github.com/explosion/spacy-pytorch-transformers#loading-models-from-a-path for how to prepare a model

        :param model_name: key into NAME_TO_DIM; also the name handed to
            spacy.load, or to PyTT_Language / PyTT_WordPiecer when a
            model_path is given
        :param model_path: optional location passed to
            PyTT_TokenVectorEncoder.from_pretrained; when empty or None the
            pipeline comes from spacy.load(model_name)
        :raises ValueError: if model_name is not a key of NAME_TO_DIM
        """
        super().__init__()
        # Validate before loading anything, so an unknown name is rejected
        # without first paying for the model load.
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " +
                             str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]
        with SwitchDefaultTensor():
            if model_path:
                self.nlp = PyTT_Language(pytt_name=model_name,
                                         meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                # The word piecer is resolved by name, only the encoder
                # weights are read from model_path.
                self.nlp.add_pipe(
                    PyTT_WordPiecer.from_pretrained(self.nlp.vocab,
                                                    model_name))
                self.nlp.add_pipe(
                    PyTT_TokenVectorEncoder.from_pretrained(
                        self.nlp.vocab, model_path))
            else:
                self.nlp = spacy.load(model_name)
Example #2
0
def nlp(name):
    """
    Build a PyTT_Language pipeline for the pretrained model called name.

    The pipeline gets three components, added in this order: a sentencizer,
    a pretrained word piecer and a pretrained token vector encoder.

    :param name: pretrained model name, forwarded to PyTT_Language and to
        both from_pretrained calls
    :return: the assembled PyTT_Language instance
    """
    language = PyTT_Language(pytt_name=name)

    sentencizer = language.create_pipe("sentencizer")
    language.add_pipe(sentencizer)

    word_piecer = PyTT_WordPiecer.from_pretrained(language.vocab,
                                                  pytt_name=name)
    language.add_pipe(word_piecer)

    encoder = PyTT_TokenVectorEncoder.from_pretrained(language.vocab,
                                                      name=name)
    language.add_pipe(encoder)

    return language
    def __init__(self,
                 model_name: str,
                 model_path: Optional[str] = None) -> None:
        """
        Set up the spaCy pipeline used for embedding and record its output size.

        :param model_name: key into NAME_TO_DIM; also the name handed to
            spacy.load, or to PyTT_Language / PyTT_WordPiecer when a
            model_path is given
        :param model_path: optional location passed to
            PyTT_TokenVectorEncoder.from_pretrained; when empty or None the
            pipeline comes from spacy.load(model_name)
        :raises ValueError: if model_name is not a key of NAME_TO_DIM
        """
        super().__init__()
        # Validate before loading anything, so an unknown name is rejected
        # without first paying for the model load.
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " +
                             str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]
        with SwitchDefaultTensor():
            if model_path:
                self.nlp = PyTT_Language(pytt_name=model_name,
                                         meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                # The word piecer is resolved by name, only the encoder
                # weights are read from model_path.
                self.nlp.add_pipe(
                    PyTT_WordPiecer.from_pretrained(self.nlp.vocab,
                                                    model_name))
                self.nlp.add_pipe(
                    PyTT_TokenVectorEncoder.from_pretrained(
                        self.nlp.vocab, model_path))
            else:
                self.nlp = spacy.load(model_name)
class SpacyTokenToVec(TokenToVec):
    """
    TokenToVec implementation that embeds token ids with a spaCy
    pytorch-transformers pipeline, running the pipeline once per sentence.
    """

    def __init__(self,
                 model_name: str,
                 model_path: Optional[str] = None) -> None:
        """
        Set up the spaCy pipeline used for embedding and record its output size.

        :param model_name: key into NAME_TO_DIM; also the name handed to
            spacy.load, or to PyTT_Language / PyTT_WordPiecer when a
            model_path is given
        :param model_path: optional location passed to
            PyTT_TokenVectorEncoder.from_pretrained; when empty or None the
            pipeline comes from spacy.load(model_name)
        :raises ValueError: if model_name is not a key of NAME_TO_DIM
        """
        super().__init__()
        # Validate before loading anything, so an unknown name is rejected
        # without first paying for the model load.
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " +
                             str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]
        with SwitchDefaultTensor():
            if model_path:
                self.nlp = PyTT_Language(pytt_name=model_name,
                                         meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                # The word piecer is resolved by name, only the encoder
                # weights are read from model_path.
                self.nlp.add_pipe(
                    PyTT_WordPiecer.from_pretrained(self.nlp.vocab,
                                                    model_name))
                self.nlp.add_pipe(
                    PyTT_TokenVectorEncoder.from_pretrained(
                        self.nlp.vocab, model_path))
            else:
                self.nlp = spacy.load(model_name)

    def get_output_dim(self) -> int:
        """
        :return: the embedding size looked up in NAME_TO_DIM at construction
        """
        return self.output_dim

    def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
        """
        Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors

        Token id 0 is treated as padding: it is dropped before the sentence
        is handed to spaCy and replaced by zero rows afterwards.

        :param vocab: vocabulary used to map each token id back to its string
        :param tokens: 2-D tensor of token ids, one row per sentence
        :return: the per-sentence embeddings stacked along dim 0, each
            sentence padded with zero rows up to tokens.shape[1]
        """
        with SwitchDefaultTensor():
            embedded_sentences = []
            tokens_cpu = tokens.cpu()
            # Unpacking (rather than indexing) keeps the implicit check that
            # tokens is 2-D.
            _, seq_len = tokens.shape
            for sentence in tokens_cpu:
                str_tokens: List[str] = [
                    vocab.get_token_from_index(int(token))
                    for token in sentence if token != 0
                ]  # skip padding
                doc = Doc(self.nlp.vocab, words=str_tokens)
                # Only components 1 and 2 are applied; component 0 is skipped
                # (the sentencizer when the pipeline is built in __init__).
                # NOTE(review): assumes a pipeline obtained via spacy.load
                # has the same component order - confirm.
                self.nlp.pipeline[1][1](doc)  # word pieces
                self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces
                # Hand the tensor over through DLPack.
                embedded = from_dlpack(
                    doc.tensor.toDlpack())  # shape (str_tokens, output dim)
                assert embedded.shape == (len(str_tokens),
                                          self.get_output_dim())
                # add padding back in
                if seq_len - len(str_tokens) > 0:
                    padded = torch.zeros(seq_len - len(str_tokens),
                                         self.get_output_dim())
                    embedded = torch.cat([embedded, padded], dim=0)
                embedded_sentences.append(embedded)
            return torch.stack(embedded_sentences, dim=0)
Example #5
0
def test_language_init(name):
    """
    Check that PyTT_Language built from explicit meta reports English
    everywhere and lists this package in its requirements.
    """
    nlp = PyTT_Language(
        pytt_name=name,
        meta={"lang": "en", "name": "test", "pipeline": []},
    )

    # The language code must be consistent across object, meta and vocab.
    assert nlp.lang == "en"
    assert nlp.meta["lang"] == "en"
    assert nlp.meta["lang_factory"] == PyTT_Language.lang_factory_name
    assert nlp.vocab.lang == "en"

    # Make sure we really have the EnglishDefaults here
    lang_getter = nlp.Defaults.lex_attr_getters[LANG]
    assert lang_getter(None) == "en"

    # Test requirements
    requirement = f"{about.__title__}>={about.__version__}"
    assert requirement in nlp.meta["requirements"]
Example #6
0
def test_language_to_from_disk(nlp, name):
    """
    Round-trip a pipeline through to_disk/from_disk and check that the
    restored pipeline has the same components and yields the same tensor.
    """
    text = "hello world"
    original_doc = nlp(text)
    assert is_valid_tensor(original_doc.tensor)

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)

        # Rebuild an empty pipeline with the same component layout, then
        # fill it from the saved directory.
        restored = PyTT_Language()
        restored.add_pipe(restored.create_pipe("sentencizer"))
        components = (
            PyTT_WordPiecer(restored.vocab, pytt_name=name),
            PyTT_TokenVectorEncoder(restored.vocab, pytt_name=name),
        )
        for component in components:
            restored.add_pipe(component)
        restored.from_disk(model_dir)

    assert restored.pipe_names == nlp.pipe_names
    restored_doc = restored(text)
    assert is_valid_tensor(restored_doc.tensor)
    assert_equal(original_doc.tensor, restored_doc.tensor)
Example #7
0
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    """
    Check that a pipeline with blank word piecer and encoder is expected to
    fail with ValueError until it is populated via from_bytes.
    """
    text = "hello world"
    reference_doc = nlp(text)
    assert is_valid_tensor(reference_doc.tensor)

    blank = PyTT_Language()
    blank.add_pipe(blank.create_pipe("sentencizer"))
    # Both components are built on the source pipeline's vocab (nlp.vocab).
    blank.add_pipe(PyTT_WordPiecer(nlp.vocab))
    blank.add_pipe(PyTT_TokenVectorEncoder(nlp.vocab))

    # Nothing has been loaded into the components yet.
    with pytest.raises(ValueError):
        blank(text)

    serialized = nlp.to_bytes()
    blank.from_bytes(serialized)

    restored_doc = blank(text)
    assert is_valid_tensor(restored_doc.tensor)
    assert restored_doc._.pytt_word_pieces is not None
Example #8
0
def test_language_wordpiece_to_from_bytes(name):
    """
    Check that a pipeline with a blank word piecer is expected to fail with
    ValueError until it is populated via from_bytes from a pretrained one.
    """
    text = "hello world"

    # Source pipeline: sentencizer plus a pretrained word piecer.
    nlp = PyTT_Language()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, pytt_name=name))
    source_doc = nlp(text)
    assert source_doc._.pytt_word_pieces is not None

    # Target pipeline: same layout, but the word piecer starts out blank.
    # The sentencizer is created through the source pipeline (nlp).
    target = PyTT_Language()
    target.add_pipe(nlp.create_pipe("sentencizer"))
    target.add_pipe(PyTT_WordPiecer(target.vocab))
    with pytest.raises(ValueError):
        target(text)

    serialized = nlp.to_bytes()
    target.from_bytes(serialized)

    restored_doc = target(text)
    assert restored_doc._.pytt_word_pieces is not None
Example #9
0
from spacy_pytorch_transformers import PyTT_Language, PyTT_WordPiecer, PyTT_TokenVectorEncoder
from pathlib import Path

# Build a Russian PyTT pipeline from the files under ~/pytorch-rubert and
# save it as a spaCy model directory under ~/spacy-rubert.
pytorch_path = str(Path.home().joinpath("pytorch-rubert"))
spacy_path = str(Path.home().joinpath("spacy-rubert"))
name = "ru_pytt_rubert_cased"

nlp = PyTT_Language(meta={"lang": "ru"}, pytt_name=name)
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Word piecer first, encoder second; both are loaded from the same path.
for _component_cls in (PyTT_WordPiecer, PyTT_TokenVectorEncoder):
    nlp.add_pipe(_component_cls.from_pretrained(nlp.vocab, pytorch_path))

print(nlp.pipe_names)
nlp.to_disk(spacy_path)
class BatchedSpacyTokenToVec(TokenToVec):
    """
    TokenToVec implementation that embeds token ids with a spaCy
    pytorch-transformers pipeline, running the pipeline once per batch on a
    single Doc that holds every sentence.
    """

    def __init__(self,
                 model_name: str,
                 model_path: Optional[str] = None) -> None:
        """
        Set up the spaCy pipeline used for embedding and record its output size.

        Loads a model_name (e.g. en_pytt_xlnetbasecased_lg) or a combination
        of model name and local model path (e.g. xlnet-large-cased and /local/mlinde/xlnet-large-cased)
        see https://github.com/explosion/spacy-pytorch-transformers#loading-models-from-a-path for how to prepare a model

        :param model_name: key into NAME_TO_DIM; also the name handed to
            spacy.load, or to PyTT_Language / PyTT_WordPiecer when a
            model_path is given
        :param model_path: optional location passed to
            PyTT_TokenVectorEncoder.from_pretrained; when empty or None the
            pipeline comes from spacy.load(model_name)
        :raises ValueError: if model_name is not a key of NAME_TO_DIM
        """
        super().__init__()
        # Validate before loading anything, so an unknown name is rejected
        # without first paying for the model load.
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " +
                             str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]
        with SwitchDefaultTensor():
            if model_path:
                self.nlp = PyTT_Language(pytt_name=model_name,
                                         meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                # The word piecer is resolved by name, only the encoder
                # weights are read from model_path.
                self.nlp.add_pipe(
                    PyTT_WordPiecer.from_pretrained(self.nlp.vocab,
                                                    model_name))
                self.nlp.add_pipe(
                    PyTT_TokenVectorEncoder.from_pretrained(
                        self.nlp.vocab, model_path))
            else:
                self.nlp = spacy.load(model_name)

    def get_output_dim(self) -> int:
        """
        :return: the embedding size looked up in NAME_TO_DIM at construction
        """
        return self.output_dim

    def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
        """
        Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors

        Token id 0 is treated as padding: it is dropped before the sentences
        are handed to spaCy and replaced by zero rows afterwards. The whole
        call runs with autograd disabled.

        :param vocab: vocabulary used to map each token id back to its string
        :param tokens: 2-D tensor of token ids, one row per sentence
        :return: the per-sentence embeddings stacked along dim 0, each
            sentence padded with zero rows up to tokens.shape[1]
        """
        with SwitchDefaultTensor(), torch.autograd.no_grad():
            embedded_sentences = []
            tokens_cpu = tokens.cpu()
            # Unpacking (rather than indexing) keeps the implicit check that
            # tokens is 2-D.
            _, seq_len = tokens.shape
            sents: List[List[str]] = []
            for sentence in tokens_cpu:
                str_tokens: List[str] = [
                    vocab.get_token_from_index(int(token))
                    for token in sentence if token != 0
                ]  # skip padding
                sents.append(str_tokens)
            # NOTE(review): make_doc is defined elsewhere; the loop below
            # relies on it producing one doc.sents entry per input sentence,
            # in order - confirm.
            doc = make_doc(self.nlp.vocab, sents)
            # Only components 1 and 2 are applied; component 0 is skipped
            # (the sentencizer when the pipeline is built in __init__).
            self.nlp.pipeline[1][1](doc)  # word pieces
            self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces

            # Now iterate over sentences in correct order and cut out the
            # correct tensor + pad it
            for sent, str_tokens in zip(doc.sents, sents):
                embedded = from_dlpack(
                    sent.tensor.toDlpack())  # shape (str_tokens, output dim)
                # add padding back in
                if seq_len - len(str_tokens) > 0:
                    padded = torch.zeros(seq_len - len(str_tokens),
                                         self.get_output_dim())
                    embedded = torch.cat([embedded, padded], dim=0)
                embedded_sentences.append(embedded)
            return torch.stack(embedded_sentences, dim=0)
# Report the CUDA environment. The device name is only queried when CUDA is
# available: torch.cuda.get_device_name(0) raises on a machine without a
# usable GPU, which would abort the script before the CPU path below.
print('N devices: {}'.format(torch.cuda.device_count()))
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
print("is available? {}".format(torch.cuda.is_available()))

###################
# Configure Spacy for vanilla BERT

start_time = time.time()

# is_using_gpu is set outside this section.
print("Spacy GPU? {}".format(is_using_gpu))
if is_using_gpu:
    # Make newly created torch tensors default to CUDA float tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

name = "scibert-scivocab-uncased"
path = "/project2/jevans/brendan/pretrained_transformers/scibert-scivocab-uncased"
nlp = PyTT_Language(pytt_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Both the word piecer and the encoder are loaded from the local path.
nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, path))
nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, path))

end_time = time.time()
print('elapsed (s): {}'.format(end_time - start_time))

####################
# Encode abstracts from the validation set

start_time = time.time()

chunk_filepath = '/project2/jevans/brendan/pubmed_data_processing/validation_sets/'
chunk_filename = chunk_filepath + 'jneurophysiol_vs_neuroimage.csv'
embedding_subchunk_size = 10000  # do 10000 abstracts at a time
Example #12
0
def main(path, name="bert-base-uncased", lang="en"):
    """
    Create a PyTT pipeline for a pretrained model, save it and reload it.

    :param path: directory the pipeline is written to and re-read from
    :param name: pretrained model name passed to PyTT_Language and to both
        from_pretrained calls
    :param lang: language code stored in the pipeline meta
    """
    printer = Printer()
    printer.info(f"Creating model for '{name}' ({lang})")

    with printer.loading("Setting up the pipeline..."):
        nlp = PyTT_Language(pytt_name=name, meta={"lang": lang})
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
        word_piecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(word_piecer)
        encoder = PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(encoder)
    printer.good("Initialized the model pipeline")

    nlp.to_disk(path)
    printer.good(f"Saved '{name}' ({lang})")
    summary_lines = (f"Pipeline: {nlp.pipe_names}", f"Location: {path}")
    for line in summary_lines:
        printer.text(line)

    # Reading the directory back confirms the saved pipeline can be loaded.
    with printer.loading("Verifying model loads..."):
        nlp.from_disk(path)
    printer.good("Model loads!")