def test_sequence_long(self):
    """
    Test case: sequences that are too long should be truncated
    """
    # Arrange
    sut = PreprocessorBertTokeniser(max_feature_len=5, tokeniser=None)
    sut.item = ["THE", "dog", "ate", "a", "biscuit"]
    expected = ["[CLS]", "THE", "dog", "ate", "[SEP]"]

    # Act
    sut.sequence_pad()

    # Assert
    self.assertSequenceEqual(expected, sut.item)
def test_sequence_short(self):
    """
    Test case: sequences that are too short should be padded
    """
    # Arrange
    sut = PreprocessorBertTokeniser(max_feature_len=5, tokeniser=None)
    sut.item = ["THE"]
    expected = ["[CLS]", "THE", "[PAD]", "[PAD]", "[SEP]"]

    # Act
    sut.sequence_pad()

    # Assert
    self.assertSequenceEqual(expected, sut.item)
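# A minimal sketch of what PreprocessorBertTokeniser.sequence_pad could look like,
# inferred only from the expected values in the two tests above; this is an
# assumption, not the actual implementation.
def sequence_pad(self):
    # Reserve two slots for the [CLS] and [SEP] special tokens.
    body_len = self.max_feature_len - 2
    # Truncate sequences that are too long ...
    tokens = self.item[:body_len]
    # ... and right-pad sequences that are too short.
    tokens = tokens + ["[PAD]"] * (body_len - len(tokens))
    # Note: the [PAD] tokens sit before [SEP], matching the expected test output.
    self.item = ["[CLS]"] + tokens + ["[SEP]"]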
def get_preprocessor(self):
    """Builds a PreprocessorBertTokeniser wired with a pretrained BERT tokeniser."""
    # Assumes BertTokenizer has been imported, e.g. from the transformers package
    # (or the older pytorch_pretrained_bert); both expose from_pretrained with a
    # do_lower_case option.
    self._logger.info("Retrieving Tokeniser")
    tokeniser = BertTokenizer.from_pretrained(self._bert_model_name,
                                              do_lower_case=self._token_lower_case)
    preprocessor = PreprocessorBertTokeniser(max_feature_len=self._max_seq_len,
                                             tokeniser=tokeniser)
    self._logger.info("Completed retrieving Tokeniser")
    return preprocessor
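# Hypothetical usage of get_preprocessor, assuming the owning factory was
# constructed with _bert_model_name="bert-base-uncased" and _max_seq_len=5;
# the factory name and the sample sentence are illustrative only:
#
#     preprocessor = factory.get_preprocessor()
#     preprocessor.item = preprocessor.tokeniser.tokenize("The dog ate a biscuit")
#     preprocessor.sequence_pad()
#     assert len(preprocessor.item) == 5  # truncated/padded to max_seq_len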