def setUpClass(cls):
    """Load the wikitext examples shared by the tests of this class.

    The loaded data is stored on ``cls.examples``: a dictionary of
    word: text, where ``text`` is itself a dictionary holding the expected
    output for each parser step:

    text['text'] -- full wikitext
    text['lang'] -- only the language section
    text['pron'] -- only the pronunciation section of the language section
    text['ipa']  -- only the extracted ipa
    """
    # Shape: {'word': {'text': ..., 'lang': ...}, 'word2': {...}}
    fixture_path = 'test/wikitext_examples.json'
    cls.examples = wp.json_load(fixture_path)
def test_extract_ipa_meets_threshold(self):
    """Check extraction counts over the pronunciation fixture.

    Loads 'test/pron.json' (word -> pronunciation section), runs both
    ``extract_pronunciation`` and ``extract_ipa_lenient`` on every section,
    and asserts that:

    * the fixture holds more than 45000 entries,
    * each extraction method yields results for more than 32000 words,
    * for more than 500 words the lenient method returns results where
      the regular one returns none, or returns more results than it.
    """
    sections_by_word = wp.json_load('test/pron.json')
    self.assertGreater(len(sections_by_word), 45000)

    # Both map a word to the ipa results extracted for it.
    strict_ipa = {}
    lenient_ipa = {}
    for word, section in sections_by_word.items():
        extracted = wp.Wikitext(section).extract_pronunciation()
        if 'ipa' in extracted:
            strict_ipa[word] = extracted['ipa']

        # A fresh Wikitext is built for the lenient pass, as before.
        lenient_hits = wp.Wikitext(section).extract_ipa_lenient()
        if lenient_hits:
            lenient_ipa[word] = lenient_hits

    # Sections where the lenient pass found something the regular pass
    # missed entirely, or found more results than the regular pass.
    lenient_gain = {}
    for word, lenient_hits in lenient_ipa.items():
        if word in strict_ipa and len(lenient_hits) <= len(strict_ipa[word]):
            continue
        lenient_gain[word] = sections_by_word[word]

    self.assertGreater(len(lenient_ipa), 32000)
    self.assertGreater(len(strict_ipa), 32000)
    self.assertGreater(len(lenient_gain), 500)