Ejemplo n.º 1
0
  def test_filter_and_bucket_subtokens(self):
    """Verify filtering by minimum count and bucketing of the survivors.

    With a threshold of 3, the subtokens counted 2 ("a") and 1 ("c") must
    not appear in any checked bucket, while "ac" (counted exactly 3) must
    be kept. The test expects bucket i to hold the surviving subtokens of
    length i, so buckets 0 and 3 stay empty for this input.
    """
    threshold = 3
    counts = collections.defaultdict(int)
    counts.update({"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})

    buckets = tokenizer._filter_and_bucket_subtokens(counts, threshold)

    # No subtoken has length 0.
    self.assertEqual(len(buckets[0]), 0)
    # Of the one-character subtokens only "b" meets the threshold.
    self.assertEqual({"b"}, buckets[1])
    self.assertEqual({"ab", "ac"}, buckets[2])
    # No input subtoken has length 3.
    self.assertEqual(len(buckets[3]), 0)
    self.assertEqual({"abbc"}, buckets[4])
Ejemplo n.º 2
0
  def test_filter_and_bucket_subtokens(self):
    """Verify filtering by minimum count and bucketing of the survivors.

    With a threshold of 3, the subtokens counted 2 ("a") and 1 ("c") must
    not appear in any checked bucket, while "ac" (counted exactly 3) must
    be kept. The test expects bucket i to hold the surviving subtokens of
    length i, so buckets 0 and 3 stay empty for this input.
    """
    threshold = 3
    counts = collections.defaultdict(int)
    counts.update({"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})

    buckets = tokenizer._filter_and_bucket_subtokens(counts, threshold)

    # No subtoken has length 0.
    self.assertEqual(len(buckets[0]), 0)
    # Of the one-character subtokens only "b" meets the threshold.
    self.assertEqual({"b"}, buckets[1])
    self.assertEqual({"ab", "ac"}, buckets[2])
    # No input subtoken has length 3.
    self.assertEqual(len(buckets[3]), 0)
    self.assertEqual({"abbc"}, buckets[4])