def test_unknown_tokens_dont_show_up(self):
    """Serialized output must contain no unknown-token markers."""
    prompt = "the beginning of the prompt"
    unknown = TransformerXLNetTokenTypes.UNKNOWN_TOKEN
    # Place the marker both mid-text and at the very end of the input.
    raw = f"{prompt}{unknown} tell me a story {unknown}"

    serialized = serialize_text_algo_individual_values(raw)

    self.assertEqual(serialized.count(unknown), 0)
def test_end_of_prompts_removes_after(self):
    """Only the text before the end-of-prompt token should be returned."""
    expected = "the beginning of the prompt"
    raw = (
        f"{expected}{TransformerXLNetTokenTypes.ENDING_OF_PROMPT}"
        " nothing should show up"
    )

    serialized = serialize_text_algo_individual_values(raw)

    self.assertEqual(serialized, expected)
def test_end_of_paragraph_returns_double_space(self):
    """Two end-of-paragraph tokens should yield four newlines in total."""
    prompt = "the beginning of the prompt"
    paragraph_end = TransformerXLNetTokenTypes.ENDING_OF_PARAGRAPH
    raw = f"{prompt}{paragraph_end}should have multiple newlines {paragraph_end}"

    serialized = serialize_text_algo_individual_values(raw)

    # two end of paragraphs should return 4 new lines
    self.assertEqual(serialized.count("\n"), 4)
def test_gpt2_text_cleanup(self):
    """Text after the GPT-2 end-of-text marker must be cut from the output."""
    kept = "Now this is a story all about how\n\nMy life got flipped upside down"
    discarded = (
        "Just the two of us, building castles in the sky, "
        "Just the two of us, you and I"
    )

    serialized = serialize_text_algo_individual_values(
        kept + GPT2_END_TEXT_STRING + discarded
    )

    # The embedded blank line inside the kept portion has to survive cleanup.
    self.assertEqual(kept, serialized)
def test_gpt2_text_cleanup_remove_new_lines(self):
    """Newlines surrounding the text should be removed from the output."""
    padded = "\n\nCat\n\n"
    self.assertEqual("Cat", serialize_text_algo_individual_values(padded))