def test_tokenization_sentence_piece(self):
    """Compare the Rust sentence-piece tokenizer against the Python baseline on SST-2 sentences."""
    # Given: a fresh pair of tokenizers loaded from the model downloaded in setup_class
    self.base_tokenizer = sentencepiece.SentencePieceProcessor()
    self.base_tokenizer.Load(str(self.test_dir / 'spiece.model'))
    self.rust_tokenizer = PySentencePieceTokenizer(str(self.test_dir / 'spiece.model'),
                                                   do_lower_case=False)
    output_baseline = [self.base_tokenizer.EncodeAsIds(example.text_a)
                       for example in self.examples]

    # When
    # Note: the original sentence piece tokenizer strips trailing spaces
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a.strip() for example in self.examples],
        max_len=256,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        if rust.token_ids == baseline:
            continue
        # Token ids may legitimately differ; accept the split when the summed
        # unigram scores of both segmentations agree.
        assert sum(self.base_tokenizer.get_score(baseline)) == \
            sum(self.base_tokenizer.get_score(rust.token_ids)), \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff_sentence_piece(rust.token_ids, baseline)} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python {baseline}'
def setup_class(self):
    """Download the SST-2 dataset and the XLNet sentence-piece model into a temp dir,
    then build the Python baseline and Rust tokenizers used by the tests."""
    self.processor = Sst2Processor()
    self.test_dir = Path(tempfile.mkdtemp())

    # Fetch and unpack the SST-2 dataset.
    sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
    contents = requests.get(sst2_url)
    # Fail fast on a bad download instead of trying to unzip an error page.
    contents.raise_for_status()
    # write_bytes opens and closes the file; the original .open('wb').write(...)
    # leaked the file handle.
    (self.test_dir / 'SST-2.zip').write_bytes(contents.content)
    with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
        zipObj.extractall(self.test_dir)
    self.examples = self.processor.get_train_examples(self.test_dir / 'SST-2')

    # Fetch the pretrained sentence-piece model.
    sentence_piece_url = 'https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model'
    contents = requests.get(sentence_piece_url)
    contents.raise_for_status()
    (self.test_dir / 'spiece.model').write_bytes(contents.content)

    self.base_tokenizer = sentencepiece.SentencePieceProcessor()
    self.base_tokenizer.Load(str(self.test_dir / 'spiece.model'))
    self.rust_tokenizer = PySentencePieceTokenizer(str(self.test_dir / 'spiece.model'),
                                                   do_lower_case=False)
def setup_rust_tokenizer(self):
    """Re-create the Rust sentence-piece tokenizer from the model downloaded in setup_class."""
    model_path = str(self.test_dir / 'spiece.model')
    self.rust_tokenizer = PySentencePieceTokenizer(model_path, do_lower_case=False)