Example #1
    def test_tokenization_sentence_piece(self):
        # Given
        self.base_tokenizer = sentencepiece.SentencePieceProcessor()
        self.base_tokenizer.Load(str(self.test_dir / 'spiece.model'))
        self.rust_tokenizer = PySentencePieceTokenizer(str(self.test_dir /
                                                           'spiece.model'),
                                                       do_lower_case=False)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.EncodeAsIds(example.text_a))

        # When
        # Note: the original SentencePiece tokenizer strips trailing spaces
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a.strip() for example in self.examples],
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust, baseline) in enumerate(
                zip(output_rust, output_baseline)):
            # SentencePiece can yield several segmentations of the same text
            # with equal cumulative score, so differing token ids are still
            # accepted when both score sums match.
            if rust.token_ids != baseline:
                assert sum(self.base_tokenizer.get_score(baseline)) == \
                       sum(self.base_tokenizer.get_score(rust.token_ids)), \
                    f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                    f'Sentence a: {self.examples[idx].text_a} \n' \
                    f'Sentence b: {self.examples[idx].text_b} \n' \
                    f'Token mismatch: {self.get_token_diff_sentence_piece(rust.token_ids, baseline)} \n' \
                    f'Rust: {rust.token_ids} \n' \
                    f'Python: {baseline}'
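
The fallback in the # Then block rests on SentencePiece piece scores: the unigram model can produce several segmentations of the same text with equal cumulative log-probability, so two id sequences are treated as equivalent tokenizations when their score sums match. A minimal standalone sketch of that check, assuming a local spiece.model and the batched get_score already used above; the sample sentence is arbitrary:

    import sentencepiece

    sp = sentencepiece.SentencePieceProcessor()
    sp.Load('spiece.model')  # placeholder path to the downloaded model

    ids = sp.EncodeAsIds('The quick brown fox jumps over the lazy dog')
    # get_score maps each piece id to its unigram log-probability; equal
    # sums across two id sequences mean equally likely segmentations.
    print(ids, sum(sp.get_score(ids)))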
Example #2
 def setup_class(self):
     # Download the SST-2 dataset and the XLNet sentencepiece model into a
     # temporary directory, then build the baseline and Rust tokenizers.
     self.processor = Sst2Processor()
     self.test_dir = Path(tempfile.mkdtemp())
     sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
     contents = requests.get(sst2_url)
     # Write the archive inside a context manager so the handle is flushed
     # and closed before ZipFile reads it back.
     with (self.test_dir / 'SST-2.zip').open('wb') as archive:
         archive.write(contents.content)
     with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zip_file:
         zip_file.extractall(self.test_dir)
     self.examples = self.processor.get_train_examples(self.test_dir /
                                                       'SST-2')
     sentence_piece_url = 'https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model'
     contents = requests.get(sentence_piece_url)
     with (self.test_dir / 'spiece.model').open('wb') as model_file:
         model_file.write(contents.content)
     self.base_tokenizer = sentencepiece.SentencePieceProcessor()
     self.base_tokenizer.Load(str(self.test_dir / 'spiece.model'))
     self.rust_tokenizer = PySentencePieceTokenizer(str(self.test_dir /
                                                        'spiece.model'),
                                                    do_lower_case=False)
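
setup_class leaves the downloaded archive, dataset, and model behind in the temporary directory. A hypothetical cleanup counterpart, not part of the original suite, assuming shutil is imported alongside tempfile:

 def teardown_class(self):
     # Hypothetical: remove the temporary directory created in setup_class
     # so repeated runs do not accumulate downloaded data and models.
     shutil.rmtree(self.test_dir, ignore_errors=True)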
Example #3
 def setup_rust_tokenizer(self):
     self.rust_tokenizer = PySentencePieceTokenizer(str(self.test_dir /
                                                        'spiece.model'),
                                                    do_lower_case=False)
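
For reference, a usage sketch assembled only from the signatures shown above (the constructor from Examples #2 and #3, encode_list from Example #1); the model path and input sentence are placeholders:

    tokenizer = PySentencePieceTokenizer('spiece.model', do_lower_case=False)
    encoded = tokenizer.encode_list(
        ['A short example sentence.'],
        max_len=256,
        truncation_strategy='longest_first',
        stride=0)
    # Each result exposes its ids via .token_ids, as asserted in Example #1.
    print(encoded[0].token_ids)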