class Lemmatizer(object):
    '''Custom lemmatizer for normalizing inflected and irregular word forms.

    Each suffix rule proposes candidate stems and keeps the first one that
    ``self.spell.check`` accepts; a word is returned unchanged when no
    candidate is accepted.
    NOTE(review): ``self.spell`` is a ``WordList`` defined outside this
    class -- presumably a dictionary lookup; confirm against its definition.
    '''

    # Inflection rule exceptions: words that merely look inflected and must
    # be returned untouched by the corresponding suffix rule.
    exceptions = {"s": ["bus", "yes", "does", "his", "hers", "mess", "hiss", "woods", "stairs", "tights", "pants", "shorts", "jeans"],
                  "ed": ["fled", "wicked", "tired", "reed", "weed", "feed", "need", "seed"],
                  "ing": ["morning", "evening", "earring", "nothing"]}

    # Patterns
    vowel = r'[aeiou]'
    nonvowel = r'[^aeiou]'
    consonant = r'[bcdfghjklmnpqrstvwxz]'

    # Regex
    # pattern for geminated words such as "running":
    gem = re.compile(r'(?P<gem>[pbmtdnrgsvz])(?P=gem)(?P<suffix>ed|ing)')
    # Same pattern anchored at the end of the word, so that only a doubled
    # consonant sitting directly before a *final* "ed"/"ing" is recognised.
    gem_final = re.compile(gem.pattern + r'$')

    def __init__(self, s_rules=True, ed_rules=True, ing_rules=True, irr_rules=True):
        '''Defaults to using all rules.

        s_rules:   normalize words ending in "s"
        ed_rules:  normalize words ending in "ed"
        ing_rules: normalize words ending in "ing"
        irr_rules: normalize irregular words
        '''
        self.spell = WordList()

        suffixes = []
        if s_rules:
            suffixes.append("s")
        if ed_rules:
            suffixes.append("ed")
        if ing_rules:
            suffixes.append("ing")
        # With every rule disabled this compiles to "(?:)$", which only ever
        # matches the empty string; is_inflected() then yields a falsy value
        # and normalize() leaves the word alone.
        self.inflection = re.compile(r"(?:" + "|".join(suffixes) + r")$")

        # False (not None) is the "irregular handling disabled" marker.
        self.irregular = False
        if irr_rules:
            self.irregular = IrregularForm()

    def __call__(self, word):
        '''Normalize word based on selected rules.'''
        return self.normalize(word)

    def normalize(self, word):
        '''Normalize word based on selected rules.

        The irregular-form handler (when enabled) is consulted first and its
        answer is used whenever it differs from the input. Otherwise the
        word is dispatched to ``normalize_<suffix>`` for the suffix it ends
        in. For words containing "+", only the part after the last "+" is
        normalized and the prefix is re-attached unchanged.
        '''
        if self.irregular:
            result = self.irregular.normalize(word)
            if result != word:
                return result

        inflection = self.is_inflected(word)
        if not inflection:
            return word

        normalize = getattr(self, "normalize_" + inflection)
        if "+" in word:
            base, end = word.rsplit("+", 1)
            return base + "+" + normalize(end)
        return normalize(word)

    def normalize_s(self, word):
        '''Return the base form of a word ending in "s", or word unchanged.

        Candidates are tried in order: plain "-s" removal, "-ies" -> "-y",
        "-ves" -> "-f" / "-fe", and finally "-es" removal.
        '''
        if len(word) < 3 or word in self.exceptions["s"]:
            return word

        stem = self.strip_suffix(word, "s")
        if self.spell.check(stem):
            return stem

        if word.endswith("ies"):
            stem = word[:-3]
            if self.spell.check(stem + "y"):
                return stem + "y"

        if word.endswith("ves"):
            if len(word) < 5:
                return word
            stem = word[:-3]
            if self.spell.check(stem + "f"):
                return stem + "f"
            # "knives" -> "knife", "wives" -> "wife": the base ends in "fe".
            if self.spell.check(stem + "fe"):
                return stem + "fe"

        if word.endswith("es"):
            stem = word[:-2]
            if self.spell.check(stem):
                return stem

        return word

    def normalize_ed(self, word):
        '''Return the base form of a word ending in "ed", or word unchanged.'''
        if len(word) < 4 or word in self.exceptions["ed"]:
            return word

        stem = self.strip_suffix(word, "ed")

        # "-ied" -> "-y" ("carried" -> "carry"), except for the short stems
        # of "tied"/"lied"/"died", which fall through to the "+e" rule below.
        if word.endswith("ied") and stem not in ["ti", "li", "di"]:
            stem = word[:-3]
            if self.spell.check(stem + 'y'):
                return stem + 'y'
            else:
                return word

        # The listed stems may pass the spell check on their own but are not
        # the intended base ("used" -> "use", not "us"), so they are forced
        # on to the "+e" candidate.
        if self.spell.check(stem) and stem not in ["us", "ti", "fe", "bl", "pleas", "scar"]:
            return stem
        elif self.spell.check(stem + 'e'):
            return stem + 'e'
        else:
            return word

    def normalize_ing(self, word):
        '''Return the base form of a word ending in "ing", or word unchanged.'''
        if len(word) < 5 or word in self.exceptions["ing"]:
            return word

        stem = self.strip_suffix(word, "ing")

        # Same idea as in normalize_ed: "using" -> "use", not "us".
        if self.spell.check(stem) and stem not in ["us", "com", "div", "writ"]:
            return stem
        elif self.spell.check(stem + 'e'):
            return stem + 'e'
        else:
            return word

    def is_inflected(self, word):
        '''Return inflectional morpheme if word is inflected or plural.

        Returns the matched suffix string, or None when the word does not
        end in any enabled suffix.
        '''
        match = self.inflection.search(word)
        if match:
            return match.group()
        return None

    def is_geminated(self, word, suffix=None):
        '''Test to see if a word is geminated.

        word:   the word to inspect
        suffix: optional; when given, the doubled consonant must sit
                directly before this suffix at the very end of the word.
                When omitted, a doubled consonant followed by "ed"/"ing"
                anywhere in the word counts (the original behaviour).

        Returns the matched text (e.g. "nning" for "running") or None.
        '''
        if suffix is None:
            match = self.gem.search(word)
            if match:
                return match.group()
            return None

        match = self.gem_final.search(word)
        if match and match.group("suffix") == suffix:
            return match.group()
        return None

    def strip_suffix(self, word, suffix):
        '''Try to intelligently strip off suffix.

        Removes ``len(suffix)`` trailing characters, plus one more when the
        suffix is directly preceded by a doubled consonant ("running" ->
        "run"). Gemination elsewhere in the word is ignored, so stripping
        "s" from "settings" gives "setting" rather than "settin".
        '''
        n = len(suffix)
        if self.is_geminated(word, suffix):
            return word[:-(n + 1)]
        else:
            return word[:-n]