Example #1
 def testTitleCase(self):
     test_string = [
         u"abc", u"ABc", u"ABC", u"Abc", u"aBcd",
         u"\u01c8bc".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_TITLE_CASE)
     self.assertAllEqual(shapes, [False, False, False, True, False, True])
Example #2
 def testSomeDigits(self):
     test_string = [
         u"abc", u"a\u06f3m".encode("utf-8"), u"90\u06f3".encode("utf-8"),
         u"a9b8c7", u"9ab87c", u"\u06f3m\u06f3"
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_SOME_DIGITS)
     self.assertAllEqual(shapes, [False, True, False, True, True, True])
Example #3
 def testMultipleShapes(self):
     test_string = [u"abc", u"ABc", u"ABC"]
     shapes = text.wordshape(
         test_string,
         [text.WordShape.IS_UPPERCASE, text.WordShape.IS_LOWERCASE])
     self.assertAllEqual(shapes,
                         [[False, True], [False, False], [True, False]])
Example #4
 def testEmoji(self):
     test_string = [
         u"\U0001f604m".encode("utf-8"), u"m\u2605m".encode("utf-8"),
         u"O:)", u"m\U0001f604".encode("utf-8"), u"\u2105k".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_EMOJI)
     self.assertAllEqual(shapes, [True, True, False, True, False])
Example #5
import tensorflow as tf
import tensorflow_text as tf_text

# ICU script code for Han ("Hani"); tf.strings.unicode_script returns
# ICU UScriptCode values.
_CHINESE_SCRIPT_ID = 17


def basic_tokenize(text_input, lower_case=False, keep_whitespace=False):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A Tensor of untokenized strings.
      lower_case: A bool indicating whether to perform case folding.
        Default is False.
      keep_whitespace: A bool indicating whether whitespace tokens should
        be kept in the output. Default is False.

    Returns:
      A RaggedTensor of tokens with shape [batch, (num_tokens)].
    """
    # Case-fold the input (if the option is set).
    if lower_case:
        text_input = tf_text.case_fold_utf8(text_input)

    # Normalize to NFD so that combining marks can be stripped below.
    text_input = tf_text.normalize_utf8(text_input, "NFD")

    # Strip control characters (Cc), format characters (Cf), and
    # nonspacing marks (Mn), i.e. the accents exposed by the NFD step.
    text_input = tf.strings.regex_replace(text_input, r"\p{Cc}|\p{Cf}|\p{Mn}",
                                          "")

    # Tokenize on Unicode script boundaries.
    script_tokenized = tf_text.UnicodeScriptTokenizer(
        keep_whitespace=keep_whitespace).tokenize(text_input)
    token_script_ids = tf.strings.unicode_script(
        tf.strings.unicode_decode(script_tokenized.flat_values, "UTF-8"))

    # Chinese, emoji, and punctuation tokens are further split into
    # individual Unicode codepoints.
    is_chinese = tf.equal(token_script_ids, _CHINESE_SCRIPT_ID)[:, :1].values
    is_emoji = tf_text.wordshape(script_tokenized.flat_values,
                                 tf_text.WordShape.HAS_EMOJI)
    is_punct = tf_text.wordshape(script_tokenized.flat_values,
                                 tf_text.WordShape.IS_PUNCT_OR_SYMBOL)
    split_cond = is_chinese | is_emoji | is_punct
    unicode_char_split = tf.strings.unicode_split(script_tokenized, "UTF-8")

    unicode_split_tokens = tf.where(
        split_cond,
        x=unicode_char_split.values,
        y=tf.expand_dims(script_tokenized.flat_values, 1))

    # Pack back into a [batch, (num_tokens), (num_unicode_chars)] RT.
    chinese_mix_tokenized = tf.RaggedTensor.from_row_lengths(
        values=unicode_split_tokens,
        row_lengths=script_tokenized.row_lengths())

    # Squeeze out to a [batch, (num_tokens)] RT.
    return collapse_dims(chinese_mix_tokenized)
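
A minimal way to exercise basic_tokenize. The collapse_dims helper is not shown in this snippet, so the definition below is a hypothetical stand-in that merges the two innermost ragged dimensions, which matches the shapes described in the comments above:

def collapse_dims(rt):
    # Hypothetical stand-in for the missing helper: fuse the
    # [batch, (num_tokens), (num_chars)] RaggedTensor down to
    # [batch, (num_tokens)].
    return rt.merge_dims(1, 2)


print(basic_tokenize(
    tf.constant([u"Héllo, TensorFlow!"]), lower_case=True).to_list())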
Example #6
 def testCloseQuote(self):
     test_string = [
         u"''", u"ABc\"", u"\u300f".encode("utf-8"),
         u"\u2018".encode("utf-8"), u"aBcd", u"``"
     ]
     shapes = text.wordshape(test_string,
                             text.WordShape.ENDS_WITH_CLOSE_QUOTE)
     self.assertAllEqual(shapes, [True, True, True, False, False, False])
Example #7
 def testSomePunct(self):
     test_string = [
         u"abc", u"a;m".encode("utf-8"), u".,!".encode("utf-8"), u"[email protected],",
         u".ab8;c", u"\u0f08m\u0f08"
     ]
     shapes = text.wordshape(test_string,
                             text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
     self.assertAllEqual(shapes, [False, True, False, True, True, True])
Example #8
 def testDashShape(self):
     test_string = [
         u"a-b", u"a\u2010b".encode("utf-8"), u"a\u2013b".encode("utf-8"),
         u"a\u2e3ab".encode("utf-8"), u"abc".encode("utf-8")
     ]
     shapes = text.wordshape(test_string,
                             text.WordShape.HAS_PUNCTUATION_DASH)
     self.assertAllEqual(shapes, [True, True, True, True, False])
Example #9
 def testWhitespace(self):
     test_string = [
         u" ", u"\v", u"\r\n", u"\u3000".encode("utf-8"), u" a", u"abc",
         u"a\nb", u"\u3000 \n".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.IS_WHITESPACE)
     self.assertAllEqual(
         shapes, [True, True, True, True, False, False, False, True])
Example #10
 def testOpenQuote(self):
     test_string = [
         u"''", u"ABc\"", u"\uff07".encode("utf-8"),
         u"\u2018".encode("utf-8"), u"aBcd", u"``"
     ]
     shapes = text.wordshape(test_string,
                             text.WordShape.BEGINS_WITH_OPEN_QUOTE)
     self.assertAllEqual(shapes, [False, False, True, True, False, True])
Example #11
 def testQuote(self):
     test_string = [
         u"''", u"ABc\"", u"\uff07".encode("utf-8"),
         u"\u2018".encode("utf-8"), u"aBcd", u"``",
         u"\u300d".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_QUOTE)
     self.assertAllEqual(shapes,
                         [True, True, True, True, False, True, True])
Example #12
 def testNonLetters(self):
     test_string = [
         u"''", u"ABc", u"\uff07".encode("utf-8"),
         u"\u2018".encode("utf-8"), u"aBcd", u"`#ab",
         u"\u300d".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_NON_LETTER)
     self.assertAllEqual(shapes,
                         [True, False, True, True, False, True, True])
Example #13
 def testNoPunct(self):
     test_string = [u"abc", u"a;m".encode("utf-8")]
     shapes = text.wordshape(test_string,
                             text.WordShape.HAS_NO_PUNCT_OR_SYMBOL)
     self.assertAllEqual(shapes, [True, False])
Example #14
 def testIsEmoticon(self):
     test_string = [u"abc", u":-)", u"O:)", u"8)x", u":\u3063C", u"abc:-)"]
     shapes = text.wordshape(test_string, text.WordShape.IS_EMOTICON)
     self.assertAllEqual(shapes, [False, True, False, False, True, False])
Example #15
 def testNumericValue(self):
     test_string = [u"98.6", u"-0.3", u"2.783E4", u"e4", u"1e10"]
     shapes = text.wordshape(test_string, text.WordShape.IS_NUMERIC_VALUE)
     self.assertAllEqual(shapes, [True, True, True, False, True])
Example #16
 def testOnlyDigits(self):
     test_string = [
         u"abc", u"a9b".encode("utf-8"), u"90\u06f3".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_ONLY_DIGITS)
     self.assertAllEqual(shapes, [False, False, True])
Example #17
 def testAcronym(self):
     test_string = [u"abc", u"A.B.", u"A.B.C.)", u"ABC"]
     shapes = text.wordshape(test_string,
                             text.WordShape.IS_ACRONYM_WITH_PERIODS)
     self.assertAllEqual(shapes, [False, True, False, False])
Example #18
 def testNoDigits(self):
     test_string = [u"abc", u"a\u06f3m".encode("utf-8")]
     shapes = text.wordshape(test_string, text.WordShape.HAS_NO_DIGITS)
     self.assertAllEqual(shapes, [True, False])
Example #19
 def testAllUppercase(self):
     test_string = [u"abc", u"ABc", u"ABC"]
     shapes = text.wordshape(test_string, text.WordShape.IS_UPPERCASE)
     self.assertAllEqual(shapes, [False, False, True])
Example #20
 def testNonShapePassedToShapeArg(self):
     test_string = [u"abc", u"ABc", u"ABC"]
     with self.assertRaises(TypeError):
         text.wordshape(test_string, "This is not a Shape")
Example #21
 def testAllLowercase(self):
     test_string = [u"abc", u"ABc", u"ABC"]
     shapes = text.wordshape(test_string, text.WordShape.IS_LOWERCASE)
     self.assertAllEqual(shapes, [True, False, False])
Example #22
 def testEllipsis(self):
     test_string = [
         u"abc", u"abc...", u"...abc", u"abc\u2026".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.ENDS_WITH_ELLIPSIS)
     self.assertAllEqual(shapes, [False, True, False, True])
Example #23
 def testLeadingPunct(self):
     test_string = [u"abc", u";b", u"b;", u";,\u0f08".encode("utf-8")]
     shapes = text.wordshape(test_string,
                             text.WordShape.BEGINS_WITH_PUNCT_OR_SYMBOL)
     self.assertAllEqual(shapes, [False, True, False, True])
Example #24
 def testMixedCase(self):
     test_string = [u"abc", u"ABc", u"ABC", u"abC"]
     shapes = text.wordshape(test_string, text.WordShape.HAS_MIXED_CASE)
     self.assertAllEqual(shapes, [False, True, False, True])
Example #25
 def testMixedCaseLetters(self):
     test_string = [u"abc", u"ABc", u"ABC", u"abC", u"abC."]
     shapes = text.wordshape(test_string,
                             text.WordShape.IS_MIXED_CASE_LETTERS)
     self.assertAllEqual(shapes, [False, True, False, True, False])
Example #26
 def testNoQuotes(self):
     test_string = [
         u"abc", u"\"ABc", u"ABC'", u"Abc\u201c".encode("utf-8"), u"aBcd"
     ]
     shapes = text.wordshape(test_string, text.WordShape.HAS_NO_QUOTES)
     self.assertAllEqual(shapes, [True, False, False, False, True])
Example #27
 def testAllPunct(self):
     test_string = [
         u"abc", u"a;b".encode("utf-8"), u";,\u0f08".encode("utf-8")
     ]
     shapes = text.wordshape(test_string, text.WordShape.IS_PUNCT_OR_SYMBOL)
     self.assertAllEqual(shapes, [False, False, True])
Example #28
 def testMathSymbol(self):
     test_string = [u"''", u"\u003c", u"\uff07".encode("utf-8")]
     shapes = text.wordshape(test_string, text.WordShape.HAS_MATH_SYMBOL)
     self.assertAllEqual(shapes, [False, True, False])
Example #29
docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                           ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
iterator = iter(tokenized_docs)
print(next(iterator).to_list())
print(next(iterator).to_list())
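# Expected output, assuming pure whitespace splitting (one ragged
# batch per document):
#   [[b'Never', b'tell', b'me', b'the', b'odds.']]
#   [[b"It's", b'a', b'trap!']]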

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Is capitalized?
f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
# Are all letters uppercased?
f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
# Does the token contain punctuation?
f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
# Is the token a number?
f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

print(f1.to_list())
print(f2.to_list())
print(f3.to_list())
print(f4.to_list())
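
As Example #3 shows, wordshape also accepts a list of shapes. A sketch of the same four features computed in a single call, with one boolean per requested shape in the innermost dimension:

features = text.wordshape(tokens, [
    text.WordShape.HAS_TITLE_CASE,
    text.WordShape.IS_UPPERCASE,
    text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL,
    text.WordShape.IS_NUMERIC_VALUE,
])
print(features.to_list())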

Example #30
 def testCurrencySymbol(self):
     test_string = [u"''", u"ABc$", u"$\uff07".encode("utf-8")]
     shapes = text.wordshape(test_string,
                             text.WordShape.HAS_CURRENCY_SYMBOL)
     self.assertAllEqual(shapes, [False, True, False])