def test_level_1(self):
        """Check level-1 PHOCs over the five-symbol alphabet "1".."5".

        Each case pairs an input word with the tuple expected from
        new_unigram_phoc when called with unigram_levels=[1].
        """
        unigram_map = {symbol: index for index, symbol in enumerate("12345")}

        cases = [
            # Every symbol of the alphabet occurs once.
            ("12345", (1, 1, 1, 1, 1)),
            # Only "3" and "4" occur.
            ("34", (0, 0, 1, 1, 0)),
            # Every symbol occurs twice; the expected tuple is unchanged.
            ("1234512345", (1, 1, 1, 1, 1)),
        ]
        for word, expected in cases:
            phoc = new_unigram_phoc(word,
                                    unigram_map=unigram_map,
                                    unigram_levels=[1])
            self.assertEqual(phoc, expected)
 def test_missing_unigram_warning(self):
     """Check that ignore_missing=True tolerates an unmapped character.

     The map has no entry for "3", yet the word "12345" must still
     yield a PHOC with one slot per mapped character.
     """
     incomplete_map = dict(zip("1245", range(4)))
     result = new_unigram_phoc(
         "12345",
         unigram_map=incomplete_map,
         unigram_levels=[1],
         ignore_missing=True,
     )
     self.assertEqual(result, (1, 1, 1, 1))
 def test_missing_unigram_exception(self):
     """Check that an unmapped character raises KeyError by default.

     The map has no entry for "3" and ignore_missing is not passed, so
     encoding "12345" is expected to fail.
     """
     incomplete_map = dict(zip("1245", range(4)))
     self.assertRaises(KeyError,
                       new_unigram_phoc,
                       "12345",
                       unigram_map=incomplete_map,
                       unigram_levels=[1])
# Build the George Washington alphabet: the sorted list of every distinct
# character that occurs in at least one vocabulary word.
alphabet = set()
for word in vocabulary:
    alphabet.update(word)
alphabet = sorted(alphabet)

# Map each character to its position in the sorted alphabet; this is the
# unigram map handed to new_unigram_phoc below.
unigram_map = dict(zip(alphabet, range(len(alphabet))))

# Raise the PHOC level one step at a time. After each step, count how many
# words produce each PHOC code, and stop once the number of codes produced
# by exactly one word equals the vocabulary size.
phoc_levels = 1
done = False
while not done:
    phoc_counter = {}
    for word in vocabulary:
        phoc = new_unigram_phoc(word, unigram_map, [phoc_levels])
        phoc_counter[phoc] = phoc_counter.get(phoc, 0) + 1

    # PHOC codes that are produced by exactly one word at this level.
    unique_phocs = [
        phoc for phoc, counter in viewitems(phoc_counter) if counter == 1
    ]
    # Report: level, number of unique codes, and unique codes per word.
    print(phoc_levels, len(unique_phocs), len(unique_phocs) / len(vocabulary))

    # The level is advanced before the exit check, so once the loop ends
    # phoc_levels is one more than the last level that was evaluated.
    phoc_levels += 1
    done = len(unique_phocs) == len(vocabulary)