def _get_scorer_and_corpus_eos():
    """Build matching MXNet and PyTorch MLM scorers (eos=True, wwm=False)
    plus a one-utterance corpus, shared by the EOS-related tests.

    Returns:
        Tuple of (MXNet scorer, PyTorch scorer, Corpus with one reference
        utterance "I am Sam").
    """
    contexts = [mx.cpu()]
    # A minimal corpus: a single utterance keyed 'utt' with reference text.
    corpus = Corpus.from_dict({'utt': {'ref': "I am Sam"}})
    # MXNet-backed scorer.
    mx_components = get_pretrained(contexts, 'bert-base-en-uncased')
    scorer_mx = MLMScorer(*mx_components, contexts, eos=True, wwm=False)
    # PyTorch-backed scorer (note the different model name for the PT weights).
    pt_components = get_pretrained(contexts, 'bert-base-uncased')
    scorer_pt = MLMScorerPT(*pt_components, contexts, eos=True, wwm=False)
    return scorer_mx, scorer_pt, corpus
def test_get_pretrained():
    """Smoke-test get_pretrained() for four MXNet BERT variants.

    For each model name, checks three things: the returned model object
    (type, depth, and one reference embedding weight), the vocabulary
    (known/unknown token behavior), and the tokenizer output on a fixed
    sentence.  The expected weight values are regression constants pinned
    to the published pretrained checkpoints.
    """
    # bert-base-en-uncased
    model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-base-en-uncased')
    # Check the model: 12-layer BERT base, and one pinned embedding weight.
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    assert pytest.approx(model.word_embed[0].params['bertmodel0_word_embed_embedding0_weight']._data[0][0,0].asscalar()) == -0.0424806065
    # Check the vocab: uncased vocab knows 'test' but maps 'Test' to <unk>.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['test'] != unk_idx
    assert vocab.token_to_idx['Test'] == unk_idx
    # Check the tokenizer: lowercasing + WordPiece splitting of a rare name.
    assert tuple(tokenizer("The man jumped up, put his basket on Philammon's head")) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'phil', '##am', '##mon', "'", 's', 'head')
    # bert-base-en-uncased-owt (OpenWebText fine-tune; same vocab/tokenizer)
    model, vocab_new, tokenizer = get_pretrained([mx.cpu()], 'bert-base-en-uncased-owt')
    # Check the model (different weights than the base checkpoint)
    assert pytest.approx(model.word_embed[0].params['bertmodel1_word_embed_embedding0_weight']._data[0][0,0].asscalar()) == -0.0361938476
    # Check the vocab: same size as the plain uncased vocab.
    assert len(vocab_new) == len(vocab)
    # Check the tokenizer
    assert tuple(tokenizer("The man jumped up, put his basket on Philammon's head")) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'phil', '##am', '##mon', "'", 's', 'head')
    # bert-large-en-cased
    model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-large-en-cased')
    # Check the model: 24-layer BERT large.
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 24
    assert pytest.approx(model.word_embed[0].params['bertmodel2_word_embed_embedding0_weight']._data[0][0,0].asscalar()) == 0.0116166482
    # Check the vocab: cased vocab keeps 'test' and 'Test' as distinct ids.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['test'] != unk_idx
    assert vocab.token_to_idx['Test'] != unk_idx
    assert vocab.token_to_idx['Test'] != vocab.token_to_idx['test']
    # Check the tokenizer: case is preserved.
    assert tuple(tokenizer("The man jumped up, put his basket on Philammon's head")) == ('The', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'Phil', '##am', '##mon', "'", 's', 'head')
    # bert-base-multi-cased
    model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-base-multi-cased')
    # Check the model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    assert pytest.approx(model.word_embed[0].params['bertmodel3_word_embed_embedding0_weight']._data[0][0,0].asscalar()) == 0.0518957935
    # Check the vocab: multilingual vocab covers both Latin and Japanese tokens.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['Test'] != unk_idx
    assert vocab.token_to_idx['これは'] != unk_idx
    # Check the tokenizer on mixed-script input.
    assert tuple(tokenizer("これは Test ですよ。")) == ('これは', 'Test', 'で', '##す', '##よ', '。')
def test_mlmscorer_score_sentences():
    """Check per-token and total sentence scores against pinned reference values.

    For each (model name, scorer class, expected per-token scores) triple:
    score "Hello world!" per-token and compare position-by-position, then
    score it again without per_token and verify the total equals the sum of
    the non-None per-token expectations.  None entries mark special tokens
    ([CLS]/[SEP]) that receive no score.
    """
    cases = (
        # README examples
        ('bert-base-en-cased', MLMScorer, (None, -6.126666069030762, -5.50140380859375, -0.7823182344436646, None)),
        ('bert-base-cased', MLMScorerPT, (None, -6.126738548278809, -5.501765727996826, -0.782496988773346, None)),
        ('gpt2-117m-en-cased', LMScorer, (-8.293947219848633, -6.387561798095703, -1.3138668537139893)),
        # etc.
        ('albert-base-v2', MLMScorerPT, (None, -16.480087280273438, -12.897505760192871, -4.277405738830566, None)),
        ('distilbert-base-cased', MLMScorerPT, (None, -5.1874895095825195, -6.390861511230469, -3.8225560188293457, None)),
    )
    for name, scorer_cls, want_tokens in cases:
        model, vocab, tokenizer = get_pretrained([mx.cpu()], name)
        scorer = scorer_cls(model, vocab, tokenizer, [mx.cpu()])
        got_tokens = scorer.score_sentences(["Hello world!"], per_token=True)[0]
        want_total = 0
        for got, want in zip(got_tokens, want_tokens):
            # Unscored special-token positions must be None on both sides.
            if got is None and want is None:
                continue
            assert pytest.approx(got, abs=0.0001) == want
            want_total += want
        # The non-per-token score must equal the sum of per-token scores.
        got_total = scorer.score_sentences(["Hello world!"], per_token=False)[0]
        assert pytest.approx(got_total, abs=0.0001) == want_total
class Server(BaseHTTPRequestHandler):
    """HTTP handler exposing MLM pseudo-log-likelihood scoring at POST /score.

    The BERT model is loaded once at class-definition time (GPU context) and
    shared by every request handled by this class.

    POST /score expects a JSON body ``{"id": ..., "texts": [...]}`` and
    responds with ``{"id": ..., "result": [...], "status": 200}``.
    """

    # Loaded once when the class is defined; shared across all requests.
    ctxs = [mx.gpu()]
    model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')
    scorer = MLMScorer(model, vocab, tokenizer, ctxs)

    def _set_headers(self, content_type):
        """Send a 200 status line plus a Content-type header."""
        self.send_response(200)
        self.send_header('Content-type', content_type)
        self.end_headers()

    @staticmethod
    def _html(message):
        """This just generates an HTML document that includes `message`
        in the body. Override, or re-write this do do more interesting stuff.
        """
        content = f"<html><body><h1>{message}</h1></body></html>"
        return content.encode('utf8')  # NOTE: must return a bytes object!

    def do_GET(self):
        self._set_headers('text/html')
        self.wfile.write(self._html('hi'))

    def do_HEAD(self):
        self._set_headers('text/html')

    def do_POST(self):
        """Score the posted sentences and return the scores as JSON."""
        print('received request')
        ctype, pdict = cgi.parse_header(self.headers.get('content-type'))
        # refuse to receive non-json content, or anything but /score
        if ctype != 'application/json' or self.path != '/score':
            self.send_response(400)
            self.end_headers()
            return
        # read the message and convert it into a python dictionary
        length = int(self.headers.get('content-length'))
        request = json.loads(self.rfile.read(length))
        sentences = request['texts']
        # BUG FIX: json.loads already produces a Python list; the original
        # called sentences.toList(), which raises AttributeError.
        corpus = Corpus.from_text(sentences)
        print(f'scoring {len(sentences)} sentences')
        scores = self.scorer.score(corpus)
        print(f'done')
        response = {'id': request['id'], 'result': scores, 'status': 200}
        # BUG FIX: the original passed the literal string 'content-type' as
        # the header *value*; the response body is JSON.
        self._set_headers('application/json')
        self.wfile.write(json.dumps(response).encode('utf8'))
def __init__(self, model_name_or_path, gpu_batch_size=1, gpu_id=0):
    """Load a pretrained masked-LM scorer onto a single GPU.

    Args:
        model_name_or_path: model identifier passed to get_pretrained().
        gpu_batch_size: batch size to use when scoring (stored for later use).
        gpu_id: index of the GPU device to load the model on.
    """
    devices = [mx.gpu(gpu_id)]
    model, vocab, tokenizer = get_pretrained(devices, model_name_or_path)
    self.scorer = MLMScorerPT(model, vocab, tokenizer, devices)
    self.gpu_batch_size = gpu_batch_size
class Server(BaseHTTPRequestHandler):
    """HTTP handler combining sentence embedding and LM scoring.

    Routes (POST only, JSON body ``{"id": ..., "texts": [...]}``):
      * /encode — return SentenceTransformer embeddings for the texts.
      * /score  — return the LM score of the first text.

    Models are loaded once at class-definition time and shared by all
    requests handled by this class.
    """

    # sentence_transformers
    sbert_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    # gpt with pytorch_pretrained_bert
    # torch.cuda.set_device(0)
    # model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    # model.eval()
    # tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    # mlms scorers
    ctxs = [mx.gpu()]
    # mlms_model, vocab, tokenizer = get_pretrained(ctxs, 'roberta-base-en-cased')
    # scorer = MLMScorer(mlms_model, vocab, tokenizer, ctxs)
    # mlms_model, vocab, tokenizer = get_pretrained(ctxs, 'distilbert-base-cased')
    # scorer = MLMScorerPT(mlms_model, vocab, tokenizer, ctxs)
    mlms_model, vocab, tokenizer = get_pretrained(ctxs, 'gpt2-117m-en-cased')
    scorer = LMScorer(mlms_model, vocab, tokenizer, ctxs)

    def _set_headers(self, content_type):
        """Send a 200 status line plus a Content-type header."""
        self.send_response(200)
        self.send_header('Content-type', content_type)
        self.end_headers()

    @staticmethod
    def _html(message):
        """This just generates an HTML document that includes `message`
        in the body. Override, or re-write this do do more interesting stuff.
        """
        content = f"<html><body><h1>{message}</h1></body></html>"
        return content.encode('utf8')  # NOTE: must return a bytes object!

    def do_GET(self):
        self._set_headers('text/html')
        self.wfile.write(self._html('hi'))

    def do_HEAD(self):
        self._set_headers('text/html')

    def do_POST(self):
        """Dispatch JSON POSTs to the /encode or /score handlers."""
        print('received request')
        ctype, pdict = cgi.parse_header(self.headers.get('content-type'))
        # refuse to receive non-json content
        if ctype != 'application/json':
            self.send_response(400)
            self.end_headers()
            return
        # read the message and convert it into a python dictionary
        length = int(self.headers.get('content-length'))
        request = json.loads(self.rfile.read(length))
        sentences = request['texts']
        if self.path == '/encode':
            # Encode sentences using Sentence scoring model
            print(f'encoding {len(sentences)} sentences')
            embeddings = self.sbert_model.encode(sentences)
            print(f'done')
            response = {
                'id': request['id'],
                'result': embeddings.tolist(),
                'status': 200
            }
            # BUG FIX: the original sent the literal string 'content-type'
            # as the header value; the body is JSON.
            self._set_headers('application/json')
            self.wfile.write(json.dumps(response).encode('utf8'))
        elif self.path == '/score':
            print(f'scoring {len(sentences)} sentences')
            scores = self.model_score(sentences)
            print(f'done')
            response = {
                'id': request['id'],
                'result': [scores[0]],
                'status': 200
            }
            # BUG FIX: same wrong Content-type value as above.
            self._set_headers('application/json')
            self.wfile.write(json.dumps(response).encode('utf8'))
        else:
            # BUG FIX: the original sent no response for unknown paths,
            # leaving the client hanging until timeout.
            self.send_response(404)
            self.end_headers()

    # uses mlms scorer
    def model_score(self, sentences):
        """Score `sentences` with the class-level LM scorer; returns a list."""
        corpus = Corpus.from_text(sentences)
        return self.scorer.score(corpus, 1.0, 50)
"""Demo script: sanity-check the original mlm-scoring library, then the
BERTweet extension, by scoring one tweet-style sentence.

NOTE(review): this appears to be a scratch/验证 script chunk — it may
continue beyond the visible lines; the final get_pretrained call's result
is loaded but not used here.
"""
from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
from mlm.models import get_pretrained
import mxnet as mx
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np

ctxs = [mx.cpu()]  # or, e.g., [mx.gpu(0), mx.gpu(1)]

# A pre-normalized tweet (HTTPURL/@USER/:cry: are BERTweet normalization tokens).
sentence = 'confirms HTTPURL via @USER :cry:'

print('Checking original MLM library..')
# MXNet MLMs (use names from mlm.models.SUPPORTED_MLMS)
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')
#print(type(vocab).__name__)
scorer = MLMScorer(model, vocab, tokenizer, ctxs)
# Expected reference outputs from the upstream README:
print(scorer.score_sentences([sentence]))
# >> [-12.410664200782776]
print(scorer.score_sentences([sentence], per_token=True))
# >> [[None, -6.126736640930176, -5.501412391662598, -0.7825151681900024, None]]

print('Done. Checking extension..')
# Load the AutoTokenizer with a normalization mode if the input Tweet is raw
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
bertweet, vocab, tokenizer = get_pretrained(ctxs, 'vinai/bertweet-base-en-cased')
#print(BERTVocab(tokenizer.vocab_file))
def test_get_pretrained():
    """Smoke-test get_pretrained() across MXNet and PyTorch backends.

    Covers four MXNet BERT variants plus the PyTorch bert-base-uncased
    checkpoint.  For each, verifies the model object (type, depth, one
    pinned embedding weight), the vocabulary (known/unknown token ids),
    and the tokenizer output on a fixed sentence.  Weight constants are
    regression values pinned to the published checkpoints; note the MXNet
    and PyTorch uncased checkpoints share the same <unk> embedding value.
    """
    # MXNet: bert-base-en-uncased
    model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-base-en-uncased')
    # Check the model: 12-layer BERT base, pinned <unk> embedding weight.
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert pytest.approx(
        model.word_embed[0].params['bertmodel0_word_embed_embedding0_weight'].
        _data[0][unk_idx, 0].asscalar()) == -0.0424806065
    # Check the vocab: uncased vocab knows 'test' but maps 'Test' to <unk>.
    assert vocab.token_to_idx['test'] != unk_idx
    assert vocab.token_to_idx['Test'] == unk_idx
    # Check the tokenizer: lowercasing + WordPiece splitting.
    assert tuple(
        tokenizer("The man jumped up, put his basket on Philammon's head")
    ) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
          'phil', '##am', '##mon', "'", 's', 'head')
    # PyTorch: bert-base-en-uncased
    model, _, tokenizer = get_pretrained([mx.cpu()], 'bert-base-uncased')
    # Check the model: PT backend returns the optimized masked-LM wrapper.
    assert isinstance(model, BertForMaskedLMOptimized)
    assert len(model.bert.encoder.layer) == 12
    unk_idx = tokenizer.unk_token_id
    # Same pinned <unk> weight as the MXNet uncased checkpoint above.
    assert pytest.approx(
        model.bert.embeddings.word_embeddings.parameters().__next__()[
            unk_idx, 0].detach().numpy().item()) == -0.0424806065
    # Check the vocab via the HF tokenizer API.
    assert tokenizer.convert_tokens_to_ids('test') != unk_idx
    assert tokenizer.convert_tokens_to_ids('Test') == unk_idx
    # Check the tokenizer
    assert tuple(
        tokenizer.tokenize(
            "The man jumped up, put his basket on Philammon's head")) == (
                'the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket',
                'on', 'phil', '##am', '##mon', "'", 's', 'head')
    # MXNet: bert-base-en-uncased-owt (OpenWebText fine-tune)
    model, vocab_new, tokenizer = get_pretrained([mx.cpu()],
                                                 'bert-base-en-uncased-owt')
    # Check the model: different weights than the base checkpoint.
    assert pytest.approx(
        model.word_embed[0].params['bertmodel1_word_embed_embedding0_weight'].
        _data[0][0, 0].asscalar()) == -0.0361938476
    # Check the vocab: same size as the plain uncased vocab.
    assert len(vocab_new) == len(vocab)
    # Check the tokenizer
    assert tuple(
        tokenizer("The man jumped up, put his basket on Philammon's head")
    ) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
          'phil', '##am', '##mon', "'", 's', 'head')
    # MXNet: bert-large-en-cased
    model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-large-en-cased')
    # Check the model: 24-layer BERT large.
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 24
    assert pytest.approx(
        model.word_embed[0].params['bertmodel2_word_embed_embedding0_weight'].
        _data[0][0, 0].asscalar()) == 0.0116166482
    # Check the vocab: cased vocab keeps 'test' and 'Test' distinct.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['test'] != unk_idx
    assert vocab.token_to_idx['Test'] != unk_idx
    assert vocab.token_to_idx['Test'] != vocab.token_to_idx['test']
    # Check the tokenizer: case preserved.
    assert tuple(
        tokenizer("The man jumped up, put his basket on Philammon's head")
    ) == ('The', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
          'Phil', '##am', '##mon', "'", 's', 'head')
    # MXNet: bert-base-multi-cased
    model, vocab, tokenizer = get_pretrained([mx.cpu()],
                                             'bert-base-multi-cased')
    # Check the model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    assert pytest.approx(
        model.word_embed[0].params['bertmodel3_word_embed_embedding0_weight'].
        _data[0][0, 0].asscalar()) == 0.0518957935
    # Check the vocab: multilingual vocab covers Latin and Japanese tokens.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['Test'] != unk_idx
    assert vocab.token_to_idx['これは'] != unk_idx
    # Check the tokenizer on unsegmented mixed-script input (note: splits
    # differ from the space-separated variant tested elsewhere).
    assert tuple(tokenizer("これはTestですよ。")) == ('これは', '##T', '##est',
                                               '##で', '##す', '##よ', '。')