def __init__(self, model_name: str, model_path: Optional[str] = None) -> None:
    """
    Loads a model_name (e.g. en_pytt_xlnetbasecased_lg) or a combination of model name
    and local model path (e.g. xlnet-large-cased and /local/mlinde/xlnet-large-cased).
    See https://github.com/explosion/spacy-pytorch-transformers#loading-models-from-a-path
    for how to prepare a model.
    :param model_name: name of a packaged spaCy model, or of the pretrained transformer if model_path is given
    :param model_path: optional local directory with the converted transformer weights
    """
    super().__init__()
    with SwitchDefaultTensor():
        if model_path:
            self.nlp = PyTT_Language(pytt_name=model_name, meta={"lang": "en"})
            self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
            self.nlp.add_pipe(PyTT_WordPiecer.from_pretrained(self.nlp.vocab, model_name))
            self.nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(self.nlp.vocab, model_path))
        else:
            self.nlp = spacy.load(model_name)
    if model_name not in NAME_TO_DIM:
        raise ValueError("Model name is unknown, I know " + str(list(NAME_TO_DIM.keys())))
    self.output_dim = NAME_TO_DIM[model_name]
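# A hedged usage sketch for the constructor above. The enclosing class is not part of
# this snippet; BatchedSpacyTokenToVec (defined further below) has the same __init__,
# so that name is assumed here, and the local path is purely illustrative:
encoder = BatchedSpacyTokenToVec("en_pytt_xlnetbasecased_lg")  # packaged spaCy model
# encoder = BatchedSpacyTokenToVec("xlnet-large-cased", "/local/mlinde/xlnet-large-cased")  # name + local weights
print(encoder.get_output_dim())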
def nlp(name):
    p_nlp = PyTT_Language(pytt_name=name)
    p_nlp.add_pipe(p_nlp.create_pipe("sentencizer"))
    p_nlp.add_pipe(PyTT_WordPiecer.from_pretrained(p_nlp.vocab, pytt_name=name))
    p_nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(p_nlp.vocab, name=name))
    return p_nlp
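# A hedged usage sketch for the factory above (in the original suite it likely serves
# as a pytest fixture); "bert-base-uncased" is only an illustrative model name:
pipeline = nlp("bert-base-uncased")
doc = pipeline("hello world")
print(doc.tensor.shape)  # one row of transformer features per spaCy token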
def __init__(self, model_name: str, model_path: Optional[str]) -> None:
    super().__init__()
    with SwitchDefaultTensor():
        if model_path:
            self.nlp = PyTT_Language(pytt_name=model_name, meta={"lang": "en"})
            self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
            self.nlp.add_pipe(PyTT_WordPiecer.from_pretrained(self.nlp.vocab, model_name))
            self.nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(self.nlp.vocab, model_path))
        else:
            self.nlp = spacy.load(model_name)
    if model_name not in NAME_TO_DIM:
        raise ValueError("Model name is unknown, I know " + str(list(NAME_TO_DIM.keys())))
    self.output_dim = NAME_TO_DIM[model_name]
class SpacyTokenToVec(TokenToVec):
    def __init__(self, model_name: str, model_path: Optional[str]) -> None:
        super().__init__()
        with SwitchDefaultTensor():
            if model_path:
                self.nlp = PyTT_Language(pytt_name=model_name, meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                self.nlp.add_pipe(PyTT_WordPiecer.from_pretrained(self.nlp.vocab, model_name))
                self.nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(self.nlp.vocab, model_path))
            else:
                self.nlp = spacy.load(model_name)
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " + str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]

    def get_output_dim(self) -> int:
        return self.output_dim

    def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
        """
        Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors
        :param vocab: vocabulary for mapping token ids back to strings
        :param tokens: token id tensor of shape (batch_size, seq_len), 0 = padding
        :return: tensor of shape (batch_size, seq_len, output_dim)
        """
        with SwitchDefaultTensor():
            embedded_sentences = []
            tokens_cpu = tokens.cpu()
            batch_size, seq_len = tokens.shape
            for sentence in tokens_cpu:
                str_tokens: List[str] = [
                    vocab.get_token_from_index(int(token))
                    for token in sentence
                    if token != 0
                ]  # skip padding
                doc = Doc(self.nlp.vocab, words=str_tokens)
                self.nlp.pipeline[1][1](doc)  # word pieces
                self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces
                # embedded = torch.from_numpy(cupy.asnumpy(doc.tensor)).to(device)  # shape (len(str_tokens), output dim)
                embedded = from_dlpack(doc.tensor.toDlpack())  # shape (len(str_tokens), output dim)
                assert embedded.shape == (len(str_tokens), self.get_output_dim())
                # add padding back in
                if seq_len - len(str_tokens) > 0:
                    padded = torch.zeros(seq_len - len(str_tokens), self.get_output_dim())
                    embedded = torch.cat([embedded, padded], dim=0)
                embedded_sentences.append(embedded)
            return torch.stack(embedded_sentences, dim=0)
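# A hedged sketch of driving SpacyTokenToVec.embed with an AllenNLP-style Vocabulary.
# The token ids and the model name are made up for illustration (the name must be a
# key of NAME_TO_DIM), and a GPU setup is assumed, since doc.tensor must be a cupy
# array for toDlpack() to exist:
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
ids = [vocab.add_token_to_namespace(w) for w in ["hello", "world"]]
tok2vec = SpacyTokenToVec("en_pytt_bertbaseuncased_lg", None)
batch = torch.tensor([ids + [0, 0]])  # one sentence, zero-padded to length 4
embedded = tok2vec.embed(vocab, batch)
assert embedded.shape == (1, 4, tok2vec.get_output_dim())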
def test_language_init(name):
    meta = {"lang": "en", "name": "test", "pipeline": []}
    nlp = PyTT_Language(meta=meta, pytt_name=name)
    assert nlp.lang == "en"
    assert nlp.meta["lang"] == "en"
    assert nlp.meta["lang_factory"] == PyTT_Language.lang_factory_name
    assert nlp.vocab.lang == "en"
    # Make sure we really have the EnglishDefaults here
    assert nlp.Defaults.lex_attr_getters[LANG](None) == "en"
    # Test requirements
    package = f"{about.__title__}>={about.__version__}"
    assert package in nlp.meta["requirements"]
def test_language_to_from_disk(nlp, name):
    doc = nlp("hello world")
    assert is_valid_tensor(doc.tensor)
    with make_tempdir() as tempdir:
        nlp.to_disk(tempdir)
        new_nlp = PyTT_Language()
        new_nlp.add_pipe(new_nlp.create_pipe("sentencizer"))
        wordpiecer = PyTT_WordPiecer(new_nlp.vocab, pytt_name=name)
        tok2vec = PyTT_TokenVectorEncoder(new_nlp.vocab, pytt_name=name)
        new_nlp.add_pipe(wordpiecer)
        new_nlp.add_pipe(tok2vec)
        new_nlp.from_disk(tempdir)
    assert new_nlp.pipe_names == nlp.pipe_names
    new_doc = new_nlp("hello world")
    assert is_valid_tensor(new_doc.tensor)
    assert_equal(doc.tensor, new_doc.tensor)
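# The tests rely on helpers (is_valid_tensor, make_tempdir, assert_equal) from the
# suite's util module; a minimal sketch of plausible implementations (assumptions,
# not the project's actual code):
import contextlib
import shutil
import tempfile
from pathlib import Path
from numpy.testing import assert_allclose

@contextlib.contextmanager
def make_tempdir():
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        shutil.rmtree(str(d))

def is_valid_tensor(tensor):
    # one row per token, non-empty hidden dimension
    return tensor is not None and tensor.ndim == 2 and tensor.shape[1] > 0

def assert_equal(t1, t2):
    assert_allclose(t1, t2, rtol=1e-5, atol=1e-6)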
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    doc = nlp("hello world")
    assert is_valid_tensor(doc.tensor)
    nlp2 = PyTT_Language()
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(PyTT_WordPiecer(nlp.vocab))
    nlp2.add_pipe(PyTT_TokenVectorEncoder(nlp.vocab))
    with pytest.raises(ValueError):
        new_doc = nlp2("hello world")
    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert is_valid_tensor(new_doc.tensor)
    assert new_doc._.pytt_word_pieces is not None
def test_language_wordpiece_to_from_bytes(name):
    nlp = PyTT_Language()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wordpiecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, pytt_name=name)
    nlp.add_pipe(wordpiecer)
    doc = nlp("hello world")
    assert doc._.pytt_word_pieces is not None
    nlp2 = PyTT_Language()
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(PyTT_WordPiecer(nlp2.vocab))
    with pytest.raises(ValueError):
        new_doc = nlp2("hello world")
    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert new_doc._.pytt_word_pieces is not None
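# The same to_bytes/from_bytes round trip should also work on a single component,
# since the pipes inherit spaCy's serialization API; a hedged sketch, reusing the
# nlp object and name from the test above:
wp = PyTT_WordPiecer.from_pretrained(nlp.vocab, pytt_name=name)
wp2 = PyTT_WordPiecer(nlp.vocab)
wp2.from_bytes(wp.to_bytes())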
from spacy_pytorch_transformers import PyTT_Language, PyTT_WordPiecer, PyTT_TokenVectorEncoder
from pathlib import Path

pytorch_path = str(Path.home() / "pytorch-rubert")
spacy_path = str(Path.home() / "spacy-rubert")
name = "ru_pytt_rubert_cased"

nlp = PyTT_Language(pytt_name=name, meta={"lang": "ru"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, pytorch_path))
nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, pytorch_path))
print(nlp.pipe_names)
nlp.to_disk(spacy_path)
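# A quick sanity check, assuming spacy_pytorch_transformers is installed (it
# registers the pytt language factory): reload the converted pipeline from disk
# and run a sentence through it.
import spacy

nlp2 = spacy.load(spacy_path)
doc = nlp2("привет мир")
print(doc.tensor.shape)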
class BatchedSpacyTokenToVec(TokenToVec):
    def __init__(self, model_name: str, model_path: Optional[str] = None) -> None:
        """
        Loads a model_name (e.g. en_pytt_xlnetbasecased_lg) or a combination of model name
        and local model path (e.g. xlnet-large-cased and /local/mlinde/xlnet-large-cased).
        See https://github.com/explosion/spacy-pytorch-transformers#loading-models-from-a-path
        for how to prepare a model.
        :param model_name: name of a packaged spaCy model, or of the pretrained transformer if model_path is given
        :param model_path: optional local directory with the converted transformer weights
        """
        super().__init__()
        with SwitchDefaultTensor():
            if model_path:
                self.nlp = PyTT_Language(pytt_name=model_name, meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                self.nlp.add_pipe(PyTT_WordPiecer.from_pretrained(self.nlp.vocab, model_name))
                self.nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(self.nlp.vocab, model_path))
            else:
                self.nlp = spacy.load(model_name)
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " + str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]

    def get_output_dim(self) -> int:
        return self.output_dim

    def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
        """
        Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors
        :param vocab: vocabulary for mapping token ids back to strings
        :param tokens: token id tensor of shape (batch_size, seq_len), 0 = padding
        :return: tensor of shape (batch_size, seq_len, output_dim)
        """
        with SwitchDefaultTensor():
            with torch.autograd.no_grad():
                embedded_sentences = []
                tokens_cpu = tokens.cpu()
                batch_size, seq_len = tokens.shape
                sents = []
                for sentence in tokens_cpu:
                    str_tokens: List[str] = [
                        vocab.get_token_from_index(int(token))
                        for token in sentence
                        if token != 0
                    ]  # skip padding
                    sents.append(str_tokens)
                doc = make_doc(self.nlp.vocab, sents)
                self.nlp.pipeline[1][1](doc)  # word pieces
                self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces
                # Now iterate over the sentences in the correct order and cut out the correct tensor + pad it
                for sent, str_tokens in zip(doc.sents, sents):
                    # add padding back in
                    embedded = from_dlpack(sent.tensor.toDlpack())  # shape (len(str_tokens), output dim)
                    if seq_len - len(str_tokens) > 0:
                        padded = torch.zeros(seq_len - len(str_tokens), self.get_output_dim())
                        embedded = torch.cat([embedded, padded], dim=0)
                    embedded_sentences.append(embedded)
                return torch.stack(embedded_sentences, dim=0)
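# make_doc is not shown in these snippets; a plausible sketch under the assumption
# that it merges the per-sentence token lists into one Doc and marks sentence
# boundaries so that doc.sents iterates in the original order (the real helper
# may differ):
from spacy.tokens import Doc

def make_doc(vocab, sents):
    words = [w for sent in sents for w in sent]
    doc = Doc(vocab, words=words)
    i = 0
    for sent in sents:
        doc[i].is_sent_start = True
        i += len(sent)
    return doc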
import time

import spacy
import torch
from spacy_pytorch_transformers import PyTT_Language, PyTT_WordPiecer, PyTT_TokenVectorEncoder

print('N devices: {}'.format(torch.cuda.device_count()))
print(torch.cuda.get_device_name(0))
print("is available? {}".format(torch.cuda.is_available()))

###################
# Configure Spacy for vanilla BERT
start_time = time.time()
is_using_gpu = spacy.prefer_gpu()  # not defined in the original excerpt; this is the usual spaCy idiom
print("Spacy GPU? {}".format(is_using_gpu))
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

name = "scibert-scivocab-uncased"
path = "/project2/jevans/brendan/pretrained_transformers/scibert-scivocab-uncased"
nlp = PyTT_Language(pytt_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, path))
nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, path))
end_time = time.time()
print('elapsed (s): {}'.format(end_time - start_time))

####################
# Encode abstracts from the validation set
start_time = time.time()
chunk_filepath = '/project2/jevans/brendan/pubmed_data_processing/validation_sets/'
chunk_filename = chunk_filepath + 'jneurophysiol_vs_neuroimage.csv'
embedding_subchunk_size = 10000  # do 10000 abstracts at a time
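# A hedged sketch of how the chunked encoding might proceed from here; the CSV
# layout (an "abstract" column) and the mean-pooling choice are assumptions:
import pandas as pd

df = pd.read_csv(chunk_filename)
abstract_vectors = []
for start in range(0, len(df), embedding_subchunk_size):
    texts = df["abstract"].iloc[start:start + embedding_subchunk_size]
    for doc in nlp.pipe(texts):
        # collapse the per-token transformer features into one vector per abstract
        abstract_vectors.append(doc.tensor.mean(axis=0))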
def main(path, name="bert-base-uncased", lang="en"):
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading(f"Setting up the pipeline..."):
        nlp = PyTT_Language(pytt_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, name))
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
    with msg.loading("Verifying model loads..."):
        nlp.from_disk(path)
    msg.good("Model loads!")
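# The example presumably exposes main() on the command line; a minimal, assumed
# entry point using plac, which spaCy's wasabi-based example scripts commonly use:
if __name__ == "__main__":
    import plac
    plac.call(main)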