def testTitleCase(self):
  test_string = [
      u"abc", u"ABc", u"ABC", u"Abc", u"aBcd", u"\u01c8bc".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_TITLE_CASE)
  self.assertAllEqual(shapes, [False, False, False, True, False, True])
def testSomeDigits(self):
  test_string = [
      u"abc", u"a\u06f3m".encode("utf-8"), u"90\u06f3".encode("utf-8"),
      u"a9b8c7", u"9ab87c", u"\u06f3m\u06f3"
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_SOME_DIGITS)
  self.assertAllEqual(shapes, [False, True, False, True, True, True])
def testMultipleShapes(self):
  test_string = [u"abc", u"ABc", u"ABC"]
  shapes = text.wordshape(
      test_string,
      [text.WordShape.IS_UPPERCASE, text.WordShape.IS_LOWERCASE])
  self.assertAllEqual(shapes, [[False, True], [False, False], [True, False]])
def testEmoji(self):
  test_string = [
      u"\U0001f604m".encode("utf-8"), u"m\u2605m".encode("utf-8"), u"O:)",
      u"m\U0001f604".encode("utf-8"), u"\u2105k".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_EMOJI)
  self.assertAllEqual(shapes, [True, True, False, True, False])
def basic_tokenize(text_input, lower_case=False, keep_whitespace=False):
  """Performs basic word tokenization for BERT.

  Args:
    text_input: A Tensor of untokenized strings.
    lower_case: A bool indicating whether or not to perform lowercasing.
      Default is False.
    keep_whitespace: A bool indicating whether or not whitespace tokens
      should be kept in the output.

  Returns:
    A [batch, (num_tokens)] RaggedTensor of tokenized strings.
  """
  # Lowercase and strip accents (if option is set).
  if lower_case:
    text_input = tf_text.case_fold_utf8(text_input)
  # Normalize by NFD.
  text_input = tf_text.normalize_utf8(text_input, "NFD")
  # Strip out control characters and the combining marks split off by NFD.
  text_input = tf.strings.regex_replace(text_input, r"\p{Cc}|\p{Cf}|\p{Mn}",
                                        "")

  # For Chinese and emoji characters, tokenize by unicode codepoints.
  script_tokenized = tf_text.unicode_script_tokenize(
      text_input, keep_whitespace=keep_whitespace, name="UTF-8")
  token_script_ids = tf.strings.unicode_script(
      tf.strings.unicode_decode(script_tokenized.flat_values, "UTF-8"))

  is_chinese = tf.equal(token_script_ids, _CHINESE_SCRIPT_ID)[:, :1].values
  is_emoji = tf_text.wordshape(script_tokenized.flat_values,
                               tf_text.WordShape.HAS_EMOJI)
  is_punct = tf_text.wordshape(script_tokenized.flat_values,
                               tf_text.WordShape.IS_PUNCT_OR_SYMBOL)
  split_cond = is_chinese | is_emoji | is_punct
  unicode_char_split = tf.strings.unicode_split(script_tokenized, "UTF-8")

  unicode_split_tokens = tf.where(
      split_cond,
      y=tf.expand_dims(script_tokenized.flat_values, 1),
      x=unicode_char_split.values)

  # Pack back into a [batch, (num_tokens), (num_unicode_chars)] RT.
  chinese_mix_tokenized = tf.RaggedTensor.from_row_lengths(
      values=unicode_split_tokens,
      row_lengths=script_tokenized.row_lengths())

  # Squeeze out to a [batch, (num_tokens)] RT.
  return collapse_dims(chinese_mix_tokenized)
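# A minimal usage sketch for basic_tokenize, not part of the original module.
# It assumes the surrounding file already provides everything the function
# relies on (`tf`, `tf_text`, `_CHINESE_SCRIPT_ID`, and the `collapse_dims`
# helper); the function name and sample strings below are illustrative only.
def _basic_tokenize_example():
  texts = tf.constant([u"Never tell me the odds.", u"TensorFlow 很好 :-)"])
  tokens = basic_tokenize(texts, lower_case=True)
  # `tokens` is a [batch, (num_tokens)] RaggedTensor of UTF-8 byte strings;
  # Chinese characters, emoji, and punctuation come back as single-codepoint
  # tokens, everything else as whole script-tokenized words.
  print(tokens.to_list())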
def testCloseQuote(self):
  test_string = [
      u"''", u"ABc\"", u"\u300f".encode("utf-8"), u"\u2018".encode("utf-8"),
      u"aBcd", u"``"
  ]
  shapes = text.wordshape(test_string, text.WordShape.ENDS_WITH_CLOSE_QUOTE)
  self.assertAllEqual(shapes, [True, True, True, False, False, False])
def testSomePunct(self):
  test_string = [
      u"abc", u"a;m".encode("utf-8"), u".,!".encode("utf-8"),
      u"[email protected],", u".ab8;c", u"\u0f08m\u0f08"
  ]
  shapes = text.wordshape(test_string,
                          text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
  self.assertAllEqual(shapes, [False, True, False, True, True, True])
def testDashShape(self):
  test_string = [
      u"a-b", u"a\u2010b".encode("utf-8"), u"a\u2013b".encode("utf-8"),
      u"a\u2e3ab".encode("utf-8"), u"abc".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_PUNCTUATION_DASH)
  self.assertAllEqual(shapes, [True, True, True, True, False])
def testWhitespace(self):
  test_string = [
      u" ", u"\v", u"\r\n", u"\u3000".encode("utf-8"), u" a", u"abc",
      u"a\nb", u"\u3000 \n".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.IS_WHITESPACE)
  self.assertAllEqual(
      shapes, [True, True, True, True, False, False, False, True])
def testOpenQuote(self):
  test_string = [
      u"''", u"ABc\"", u"\uff07".encode("utf-8"), u"\u2018".encode("utf-8"),
      u"aBcd", u"``"
  ]
  shapes = text.wordshape(test_string, text.WordShape.BEGINS_WITH_OPEN_QUOTE)
  self.assertAllEqual(shapes, [False, False, True, True, False, True])
def testQuote(self):
  test_string = [
      u"''", u"ABc\"", u"\uff07".encode("utf-8"), u"\u2018".encode("utf-8"),
      u"aBcd", u"``", u"\u300d".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_QUOTE)
  self.assertAllEqual(shapes, [True, True, True, True, False, True, True])
def testNonLetters(self):
  test_string = [
      u"''", u"ABc", u"\uff07".encode("utf-8"), u"\u2018".encode("utf-8"),
      u"aBcd", u"`#ab", u"\u300d".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_NON_LETTER)
  self.assertAllEqual(shapes, [True, False, True, True, False, True, True])
def testNoPunct(self):
  test_string = [u"abc", u"a;m".encode("utf-8")]
  shapes = text.wordshape(test_string, text.WordShape.HAS_NO_PUNCT_OR_SYMBOL)
  self.assertAllEqual(shapes, [True, False])
def testIsEmoticon(self):
  test_string = [u"abc", u":-)", u"O:)", u"8)x", u":\u3063C", u"abc:-)"]
  shapes = text.wordshape(test_string, text.WordShape.IS_EMOTICON)
  self.assertAllEqual(shapes, [False, True, False, False, True, False])
def testNumericValue(self):
  test_string = [u"98.6", u"-0.3", u"2.783E4", u"e4", u"1e10"]
  shapes = text.wordshape(test_string, text.WordShape.IS_NUMERIC_VALUE)
  self.assertAllEqual(shapes, [True, True, True, False, True])
def testOnlyDigits(self):
  test_string = [
      u"abc", u"a9b".encode("utf-8"), u"90\u06f3".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_ONLY_DIGITS)
  self.assertAllEqual(shapes, [False, False, True])
def testAcronym(self):
  test_string = [u"abc", u"A.B.", u"A.B.C.)", u"ABC"]
  shapes = text.wordshape(test_string, text.WordShape.IS_ACRONYM_WITH_PERIODS)
  self.assertAllEqual(shapes, [False, True, False, False])
def testNoDigits(self):
  test_string = [u"abc", u"a\u06f3m".encode("utf-8")]
  shapes = text.wordshape(test_string, text.WordShape.HAS_NO_DIGITS)
  self.assertAllEqual(shapes, [True, False])
def testAllUppercase(self):
  test_string = [u"abc", u"ABc", u"ABC"]
  shapes = text.wordshape(test_string, text.WordShape.IS_UPPERCASE)
  self.assertAllEqual(shapes, [False, False, True])
def testNonShapePassedToShapeArg(self):
  test_string = [u"abc", u"ABc", u"ABC"]
  with self.assertRaises(TypeError):
    text.wordshape(test_string, "This is not a Shape")
def testAllLowercase(self):
  test_string = [u"abc", u"ABc", u"ABC"]
  shapes = text.wordshape(test_string, text.WordShape.IS_LOWERCASE)
  self.assertAllEqual(shapes, [True, False, False])
def testEllipsis(self):
  test_string = [
      u"abc", u"abc...", u"...abc", u"abc\u2026".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.ENDS_WITH_ELLIPSIS)
  self.assertAllEqual(shapes, [False, True, False, True])
def testLeadingPunct(self):
  test_string = [u"abc", u";b", u"b;", u";,\u0f08".encode("utf-8")]
  shapes = text.wordshape(test_string,
                          text.WordShape.BEGINS_WITH_PUNCT_OR_SYMBOL)
  self.assertAllEqual(shapes, [False, True, False, True])
def testMixedCase(self):
  test_string = [u"abc", u"ABc", u"ABC", u"abC"]
  shapes = text.wordshape(test_string, text.WordShape.HAS_MIXED_CASE)
  self.assertAllEqual(shapes, [False, True, False, True])
def testMixedCaseLetters(self):
  test_string = [u"abc", u"ABc", u"ABC", u"abC", u"abC."]
  shapes = text.wordshape(test_string, text.WordShape.IS_MIXED_CASE_LETTERS)
  self.assertAllEqual(shapes, [False, True, False, True, False])
def testNoQuotes(self):
  test_string = [
      u"abc", u"\"ABc", u"ABC'", u"Abc\u201c".encode("utf-8"), u"aBcd"
  ]
  shapes = text.wordshape(test_string, text.WordShape.HAS_NO_QUOTES)
  self.assertAllEqual(shapes, [True, False, False, False, True])
def testAllPunct(self):
  test_string = [
      u"abc", u"a;b".encode("utf-8"), u";,\u0f08".encode("utf-8")
  ]
  shapes = text.wordshape(test_string, text.WordShape.IS_PUNCT_OR_SYMBOL)
  self.assertAllEqual(shapes, [False, False, True])
def testMathSymbol(self):
  test_string = [u"''", u"\u003c", u"\uff07".encode("utf-8")]
  shapes = text.wordshape(test_string, text.WordShape.HAS_MATH_SYMBOL)
  self.assertAllEqual(shapes, [False, True, False])
print(offset_limits.to_list())

docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                            ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
iterator = iter(tokenized_docs)
print(next(iterator).to_list())
print(next(iterator).to_list())

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Is capitalized?
f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
# Are all letters uppercased?
f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
# Does the token contain punctuation?
f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
# Is the token a number?
f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

print(f1.to_list())
print(f2.to_list())
print(f3.to_list())
print(f4.to_list())
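# A sketch of an alternative to the four separate calls above: wordshape also
# accepts a list of shapes (as testMultipleShapes demonstrates), returning one
# boolean per requested shape in the innermost dimension. The `features` name
# is illustrative only.
features = text.wordshape(
    tokens,
    [text.WordShape.HAS_TITLE_CASE, text.WordShape.IS_UPPERCASE,
     text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL, text.WordShape.IS_NUMERIC_VALUE])
print(features.to_list())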
def testCurrencySymbol(self):
  test_string = [u"''", u"ABc$", u"\uff07".encode("utf-8")]
  shapes = text.wordshape(test_string, text.WordShape.HAS_CURRENCY_SYMBOL)
  self.assertAllEqual(shapes, [False, True, False])