def setup_class(self):
    self.use_gpu = torch.cuda.is_available()
    self.test_dir = Path(tempfile.mkdtemp())
    self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
        do_lower_case=True,
        strip_accents=True)
    self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                               output_attentions=False).eval()
    if self.use_gpu:
        self.model.cuda()
    self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                          'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                          ' all the dolphins had ever done was muck about in the water having a good time. But '
                          'conversely, the dolphins had always believed that they were far more intelligent than '
                          'man—for precisely the same reasons.'] * 64

    # Pre-allocate GPU memory with a warm-up forward pass
    tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
    features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
    features = [self.base_tokenizer.prepare_for_model(input, None, add_special_tokens=True, max_length=128)
                for input in features]
    all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

    if self.use_gpu:
        all_input_ids = all_input_ids.cuda()

    with torch.no_grad():
        _ = self.model(all_input_ids)[0].cpu().numpy()
def test_tokenization_distilbert(self):
    # Given
    self.base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                              do_lower_case=True,
                                                              cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']),
        do_lower_case=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                               text_pair=example.text_b,
                                                               add_special_tokens=True,
                                                               return_overflowing_tokens=True,
                                                               return_special_tokens_mask=True,
                                                               max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_pair_list(
        [(example.text_a, example.text_b) for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for rust, baseline in zip(output_rust, output_baseline):
        assert rust.token_ids == baseline['input_ids']
        assert rust.segment_ids == baseline['token_type_ids']
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def setup_class(self):
    self.processor = Sst2Processor()
    self.test_dir = Path(tempfile.mkdtemp())
    sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
    contents = requests.get(sst2_url)
    (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
    with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
        zipObj.extractall(self.test_dir)
    self.examples = self.processor.get_train_examples(self.test_dir / 'SST-2')
    self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
        do_lower_case=True)
def test_tokenization_distilbert(self):
    # Given
    self.base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased',
                                                              do_lower_case=False,
                                                              cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-cased']),
        do_lower_case=False,
        strip_accents=False)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                               text_pair=example.text_b,
                                                               add_special_tokens=True,
                                                               return_overflowing_tokens=True,
                                                               return_special_tokens_mask=True,
                                                               max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_pair_list(
        [(example.text_a, example.text_b) for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python {baseline["input_ids"]}'
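# Note: `get_token_diff` is called in the assertion above but its implementation is not
# shown in this excerpt. The sketch below is an assumed, illustrative implementation only,
# not the original helper: it returns a small window from each id sequence around the first
# position where the Rust and Python outputs diverge, which is enough to build the mismatch
# message used in the assertion.
def get_token_diff(self, rust_ids, python_ids):
    # Find the first index where the two id sequences disagree.
    for position, (rust_id, python_id) in enumerate(zip(rust_ids, python_ids)):
        if rust_id != python_id:
            break
    else:
        # The sequences agree on their overlap; any difference is in the lengths.
        position = min(len(rust_ids), len(python_ids))
    # Return a small window around the divergence point from both sequences.
    return (rust_ids[max(0, position - 2):position + 3],
            python_ids[max(0, position - 2):position + 3])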
def setup_rust_tokenizer(self):
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
        do_lower_case=True)
class TestBenchmarkBert:
    def setup_class(self):
        self.processor = Sst2Processor()
        self.test_dir = Path(tempfile.mkdtemp())
        sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
        contents = requests.get(sst2_url)
        (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
        with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
            zipObj.extractall(self.test_dir)
        self.examples = self.processor.get_train_examples(self.test_dir / 'SST-2')
        self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                            do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
            do_lower_case=True)

    def setup_python_tokenizer(self):
        self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                            do_lower_case=True,
                                                            cache_dir=self.test_dir)

    def setup_rust_tokenizer(self):
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
            do_lower_case=True)

    def python_bert_tokenizer(self):
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                                   add_special_tokens=True,
                                                                   return_overflowing_tokens=True,
                                                                   return_special_tokens_mask=True,
                                                                   max_length=128))

    def rust_bert_tokenizer_single_threaded(self):
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.rust_tokenizer.encode(example.text_a,
                                                              max_len=128,
                                                              truncation_strategy='longest_first',
                                                              stride=0))

    def rust_bert_tokenizer_multi_threaded(self):
        self.rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                        max_len=128,
                                        truncation_strategy='longest_first',
                                        stride=0)

    def test_python_bert_tokenizer_single_threaded(self, benchmark):
        benchmark.pedantic(self.python_bert_tokenizer,
                           setup=self.setup_python_tokenizer,
                           iterations=1,
                           rounds=3)

    def test_rust_bert_tokenizer_single_threaded(self, benchmark):
        benchmark.pedantic(self.rust_bert_tokenizer_single_threaded,
                           setup=self.setup_rust_tokenizer,
                           iterations=1,
                           rounds=3)

    def test_rust_bert_tokenizer_multi_threaded(self, benchmark):
        benchmark.pedantic(self.rust_bert_tokenizer_multi_threaded,
                           setup=self.setup_rust_tokenizer,
                           iterations=1,
                           rounds=3)
class TestBenchmarkDistilBert:
    def setup_class(self):
        self.use_gpu = torch.cuda.is_available()
        self.test_dir = Path(tempfile.mkdtemp())
        self.base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                                  do_lower_case=True,
                                                                  cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']),
            do_lower_case=True)
        self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                                         output_attentions=False).eval()
        if self.use_gpu:
            self.model.cuda()
        self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                              'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                              ' all the dolphins had ever done was muck about in the water having a good time. But '
                              'conversely, the dolphins had always believed that they were far more intelligent than '
                              'man—for precisely the same reasons.'] * 64

        # Pre-allocate GPU memory with a warm-up forward pass
        tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
        features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
        features = [self.base_tokenizer.prepare_for_model(input, None, add_special_tokens=True, max_length=128)
                    for input in features]
        all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()

        with torch.no_grad():
            _ = self.model(all_input_ids)[0].cpu().numpy()

    def setup_base_tokenizer(self):
        self.base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                                  do_lower_case=True,
                                                                  cache_dir=self.test_dir)

    def setup_rust_tokenizer(self):
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']),
            do_lower_case=True)

    def baseline_batch(self):
        tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
        features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
        features = [self.base_tokenizer.prepare_for_model(input, None, add_special_tokens=True, max_length=128)
                    for input in features]
        all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()
        with torch.no_grad():
            output = self.model(all_input_ids)[0].cpu().numpy()
        return output

    def rust_batch_single_threaded(self):
        features = [self.rust_tokenizer.encode(sentence,
                                               max_len=128,
                                               truncation_strategy='longest_first',
                                               stride=0)
                    for sentence in self.sentence_list]
        all_input_ids = torch.tensor([f.token_ids for f in features], dtype=torch.long)
        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()
        with torch.no_grad():
            output = self.model(all_input_ids)[0].cpu().numpy()
        return output

    def rust_batch_multi_threaded(self):
        features = self.rust_tokenizer.encode_list(self.sentence_list,
                                                   max_len=128,
                                                   truncation_strategy='longest_first',
                                                   stride=0)
        all_input_ids = torch.tensor([f.token_ids for f in features], dtype=torch.long)
        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()
        with torch.no_grad():
            output = self.model(all_input_ids)[0].cpu().numpy()
        return output

    def test_distilbert_baseline(self):
        values = []
        for i in range(10):
            self.setup_base_tokenizer()
            t0 = timer()
            self.baseline_batch()
            t1 = timer()
            values.append((t1 - t0) * 1000)
        mean = sum(values) / len(values)
        # Sample standard deviation of the timed runs, in milliseconds
        std_dev = math.sqrt(sum([(value - mean) ** 2 for value in values]) / (len(values) - 1))
        print(f'baseline - mean: {mean:.2f}, std. dev: {std_dev:.2f}')

    def test_distilbert_rust_single_threaded(self):
        values = []
        for i in range(10):
            self.setup_rust_tokenizer()
            t0 = timer()
            self.rust_batch_single_threaded()
            t1 = timer()
            values.append((t1 - t0) * 1000)
        mean = sum(values) / len(values)
        std_dev = math.sqrt(sum([(value - mean) ** 2 for value in values]) / (len(values) - 1))
        print(f'rust single thread - mean: {mean:.2f}, std. dev: {std_dev:.2f}')

    def test_distilbert_rust_multi_threaded(self):
        values = []
        for i in range(10):
            self.setup_rust_tokenizer()
            t0 = timer()
            self.rust_batch_multi_threaded()
            t1 = timer()
            values.append((t1 - t0) * 1000)
        mean = sum(values) / len(values)
        std_dev = math.sqrt(sum([(value - mean) ** 2 for value in values]) / (len(values) - 1))
        print(f'rust multi threaded - mean: {mean:.2f}, std. dev: {std_dev:.2f}')

    def teardown_class(self):
        self.model = None
        self.base_tokenizer = None
        self.rust_tokenizer = None
        gc.collect()
        torch.cuda.empty_cache()
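# The three timing tests above repeat the same measure-and-summarize bookkeeping. A small
# shared helper along the following lines could factor it out. This is an illustrative
# sketch only (`time_runs` is not part of the original test suite), assuming the same
# `timer` used above (e.g. timeit's default_timer) and `math` are in scope.
def time_runs(setup_fn, run_fn, rounds=10):
    values = []
    for _ in range(rounds):
        setup_fn()
        t0 = timer()
        run_fn()
        t1 = timer()
        values.append((t1 - t0) * 1000)  # elapsed time in milliseconds
    mean = sum(values) / len(values)
    std_dev = math.sqrt(sum((value - mean) ** 2 for value in values) / (len(values) - 1))
    return mean, std_dev

# Example usage inside a test:
#     mean, std_dev = time_runs(self.setup_rust_tokenizer, self.rust_batch_multi_threaded)
#     print(f'rust multi threaded - mean: {mean:.2f}, std. dev: {std_dev:.2f}')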
class TestTokenizationQNLI:
    def setup_class(self):
        self.processor = QnliProcessor()
        # Note: these tests do not automatically download the test datasets. Please download them
        # manually and update your environment variables accordingly.
        self.examples = self.processor.get_train_examples(os.environ["QNLI_PATH"])
        self.test_dir = Path(tempfile.mkdtemp())

    def test_tokenization_bert(self):
        # Given
        self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                            do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
            do_lower_case=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                                   text_pair=example.text_b,
                                                                   add_special_tokens=True,
                                                                   return_overflowing_tokens=True,
                                                                   return_special_tokens_mask=True,
                                                                   max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_pair_list(
            [(example.text_a, example.text_b) for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def test_tokenization_distilbert(self):
        # Given
        self.base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                                  do_lower_case=True,
                                                                  cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']),
            do_lower_case=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                                   text_pair=example.text_b,
                                                                   add_special_tokens=True,
                                                                   return_overflowing_tokens=True,
                                                                   return_special_tokens_mask=True,
                                                                   max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_pair_list(
            [(example.text_a, example.text_b) for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']