def testOverrides(self):
    """Exercise lemma lookups for the tag 'X' and the inflection overrides table.

    The test first checks the stock inflection of 'watch', then verifies that
    the three lemma look-up functions log a message and return an empty result
    when called with the tag 'X', and finally swaps in a fake overrides
    dictionary to confirm that ``getInflection`` consults it.
    """
    # Run the inflection system once to make sure the overrides are loaded
    # (they are lazily loaded), and assert the stock result.  The original
    # statement only built a throw-away tuple and never compared anything.
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))

    # Keep a handle on the real overrides dictionary so it can be restored.
    orig_dict = lemminflect.Inflections().overrides_dict

    # Each look-up with the tag 'X' is expected to emit at least one log
    # record (assertLogs fails otherwise) and to return an empty container.
    with self.assertLogs():
        lemmas = lemminflect.getLemma('WORD', 'X')
    self.assertEqual(lemmas, ())
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmas('WORD', 'X')
    self.assertEqual(lemmas, {})
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
    self.assertEqual(lemmas, {})

    # The spaCy extension should lemmatize the pronoun 'I' to itself.
    token = self.nlp('I')[0]
    self.assertEqual(token._.lemma(), 'I')

    # Hack the code to replace the overrides dictionary.  The restore is in
    # a ``finally`` so a failing assertion cannot leak the fake overrides
    # into the tests that run afterwards.
    try:
        lemminflect.Inflections().overrides_dict = {
            'watch': {
                'VBD': ('xxx', )
            }
        }
        inflections = lemminflect.getInflection('watch', 'VBD', inflect_oov=False)
        self.assertEqual(inflections, ('xxx', ))
    finally:
        # put the original dictionary back
        lemminflect.Inflections().overrides_dict = orig_dict
def testProperNouns(self):
    """Verify singular/plural proper-noun inflection, directly and through spaCy."""
    # Tagged as a common noun with OOV handling off, nothing is returned.
    common_forms = lemminflect.getInflection('Alaskan', 'NN', inflect_oov=False)
    self.assertEqual(len(common_forms), 0)

    # (lemma, tag, inflect_oov, expected single form)
    direct_cases = (
        ('Alaskan', 'NNP', False, 'Alaskan'),
        ('Alaskan', 'NNPS', False, 'Alaskans'),
        ('Axxlaskan', 'NNP', True, 'Axxlaskan'),
        ('Axxlaskan', 'NNPS', True, 'Axxlaskans'),
    )
    for lemma, tag, use_oov, expected in direct_cases:
        forms = lemminflect.getInflection(lemma, tag, inflect_oov=use_oov)
        self.assertEqual(len(forms), 1)
        self.assertEqual(forms[0], expected)

    # lemmatize with lemminflect
    lemminflect.Inflections().setUseInternalLemmatizer(True)

    # (sentence, inflect_oov, expected plural); the noun is always token 1.
    spacy_cases = (
        ('The Alaskan went South.', False, 'Alaskans'),
        ('The Axxlaskan went South.', True, 'Axxlaskans'),
    )
    for sentence, use_oov, expected in spacy_cases:
        noun = self.nlp(sentence)[1]
        self.assertEqual(noun._.inflect('NNPS', inflect_oov=use_oov), expected)
# Load the corpus to test with print('Loading corpus from ', corp_fn) sents = loadFile(corp_fn, max_sents) print('Loaded {:,} test sentences'.format(len(sents))) print() # Create an empty overrides file before calling lemminflect because it loads this file on # first use. This will mess-up the overrides creation process since overrides will be used. # Fix this issue by creating an empty file open(config.infl_overrides_fn, 'w').close() # Loop through the sentences and count the instances of (lemma, tag, corpus_word) # corpus_word is considered the "correct" inflection for the lemma/tag print('Processing sentences. Use internal lemmatizer = ', \ lemminflect.Inflections().isUsingInternalLemmatizer()) infl_ctr = Counter() pb = ProgressBar(len(sents)) for i, sent in enumerate(sents): doc = nlp(sent) for word in doc: # Filter out numbers, foreign characters, etc.. if not isASCIIWord(word.text) or not word.tag_: continue # Skip aux and modal aux verbs since they're oddballs anyway if word.lemma_.lower() in ['be', 'have', 'do', 'will', 'can', 'may', 'shall', 'will', \ 'ought', 'dare']: continue # Only inflect regular nouns, verbs, adverbs and adjectives # Don't check inflections of particles or proper nouns ptype = word.tag_[0]
def testSpacyInflect02(self):
    """Re-run ``testSpacyInflect01`` with spaCy's lemmatizer instead of the internal one.

    The lemmatizer choice is a setting on the shared ``Inflections`` object, so
    the previous value is restored afterwards; otherwise every test that runs
    after this one would silently use the spaCy lemmatizer too.
    """
    was_internal = lemminflect.Inflections().isUsingInternalLemmatizer()
    lemminflect.Inflections().setUseInternalLemmatizer(False)  # lemmatize with spaCy
    try:
        self.testSpacyInflect01()
    finally:
        # Restore even when the delegated test fails.
        lemminflect.Inflections().setUseInternalLemmatizer(was_internal)
def __init__(self, *args, **kwargs):
    """Initialise the test case, attach the spaCy pipeline and select the lemmatizer."""
    super().__init__(*args, **kwargs)
    # lemmatize with lemminflect
    lemminflect.Inflections().setUseInternalLemmatizer(True)
    # Module-level spaCy pipeline used by the tests via self.nlp.
    self.nlp = SPACY_NLP