def __init__(self, filename=None, num_reserved_ids=2):
  """Create a SubwordTextEncoder, optionally loading a vocabulary file.

  Args:
    filename: optional path to a stored subword vocabulary; if given, the
      encoder state is restored from it before the base class initializes.
    num_reserved_ids: number of ids reserved at the start of the id space.
  """
  self._tokenizer = tokenizer.Tokenizer()
  vocab_path = filename
  if vocab_path is not None:
    self._load_from_file(vocab_path)
  super(SubwordTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
def testInvertibilityOnRandomStrings(self):
  """decode(encode(s)) must reproduce s for arbitrary BMP code points."""
  tok = tokenizer.Tokenizer()
  random.seed(123)
  for _ in xrange(1000):
    chars = [unichr(random.randint(0, 65535)) for _ in xrange(10)]
    sample = u"".join(chars)
    round_tripped = tok.decode(tok.encode(sample))
    self.assertEqual(sample, round_tripped)
def testEncode(self):
  """Spot-checks tokenization: punctuation, accents, digits, whitespace."""
  tok = tokenizer.Tokenizer()
  self.assertEqual(tok.encode(u"802.11b"), [u"802", u".", u"11b"])
  self.assertEqual(tok.encode(u"two. \nlines"), [u"two", u". \n", u"lines"])
  self.assertEqual(
      tok.encode(u"Dude - that's so cool."),
      [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])
  self.assertEqual(
      tok.encode(u"Łukasz est né en 1981."),
      [u"Łukasz", u"est", u"né", u"en", u"1981", u"."])
  self.assertEqual(
      tok.encode(u" Spaces at the ends "),
      [u" ", u"Spaces", u"at", u"the", u"ends", u" "])
def get_token_counts(cls, text_filepattern, corpus_max_lines):
  """Read the corpus and compute a dictionary of token counts.

  Args:
    text_filepattern: glob pattern matching the corpus text files.
    corpus_max_lines: an int; if positive, stop after reading this many
      lines in total across all matched files. Non-positive means no limit.

  Returns:
    a dict mapping token (unicode string) -> count (int).
  """
  tok = tokenizer.Tokenizer()
  lines_read = 0
  filenames = tf.gfile.Glob(text_filepattern)
  for text_filename in filenames:
    with tf.gfile.Open(text_filename) as f:
      for line in f:
        # The tokenizer accumulates counts in tok.token_counts as a side
        # effect of encode().
        tok.encode(_native_to_unicode(line.strip()))
        lines_read += 1
        # Bug fix: the old check used `>`, which tokenized one line beyond
        # the requested limit (corpus_max_lines + 1 lines in total).
        if corpus_max_lines > 0 and lines_read >= corpus_max_lines:
          return tok.token_counts
  return tok.token_counts
def testEncode(self):
  """Spot-checks tokenization on native (byte) strings."""
  tok = tokenizer.Tokenizer()
  self.assertEqual(tok.encode("802.11b"), ["802", ".", "11b"])
  self.assertEqual(tok.encode("two. \nlines"), ["two", ". \n", "lines"])
  self.assertEqual(
      tok.encode("Dude - that's so cool."),
      ["Dude", " - ", "that", "'", "s", "so", "cool", "."])
  # TODO(lukaszkaiser): make it work again with Unicode.
  # self.assertEqual(
  #     tok.encode("Łukasz est né en 1981."),
  #     ["Łukasz", "est", "né", "en", "1981", "."])
  self.assertEqual(
      tok.encode(" Spaces at the ends "),
      [" ", "Spaces", "at", "the", "ends", " "])
def get_token_counts(cls, text_filepattern, corpus_max_lines):
  """Read the corpus and compute a dictionary of word counts.

  Args:
    text_filepattern: glob pattern matching the corpus text files.
    corpus_max_lines: an int; if positive, stop after reading this many
      lines in total across all matched files. Non-positive means no limit.

  Returns:
    a dict mapping token (string) -> count (int).
  """
  tok = tokenizer.Tokenizer()
  token_counts = {}
  lines_read = 0
  filenames = tf.gfile.Glob(text_filepattern)
  for text_filename in filenames:
    with tf.gfile.Open(text_filename) as f:
      for line in f:
        tokens = tok.encode(line.strip())
        for t in tokens:
          token_counts[t] = token_counts.get(t, 0) + 1
        lines_read += 1
        # Bug fix: the old check used `>`, which counted tokens from one
        # line beyond the requested limit (corpus_max_lines + 1 in total).
        if corpus_max_lines > 0 and lines_read >= corpus_max_lines:
          return token_counts
  return token_counts
def testDecode(self):
  """Joining a token list must reconstruct the original sentence."""
  tok = tokenizer.Tokenizer()
  tokens = [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]
  self.assertEqual(tok.decode(tokens), u"Dude - that's so cool.")
def testDecode(self):
  """Decoding CJK tokens joins them with single spaces."""
  tok = tokenizer.Tokenizer()
  decoded = tok.decode(["你", "好", "-", "?"])
  self.assertEqual(decoded, "你 好 - ?")
def testEncode(self):
  """Encoding a space-separated CJK sentence yields one token per symbol."""
  tok = tokenizer.Tokenizer()
  encoded = tok.encode("你 好 - ?")
  self.assertEqual(encoded, ["你", "好", "-", "?"])
def testInvertibilityOnRandomStrings(self):
  """Round-trips random byte strings through encode/decode.

  NOTE(review): the loop bound is 0, so the body never runs — the test is
  deliberately disabled pending the Unicode fix noted below; left as-is.
  """
  tok = tokenizer.Tokenizer()
  random.seed(123)
  for _ in xrange(0):  # TODO(lukaszkaiser): make it work again with Unicode.
    raw = [six.int2byte(random.randint(0, 255)) for _ in xrange(10)]
    sample = "".join(raw)
    self.assertEqual(sample, tok.decode(tok.encode(sample)))