class IrregularForm(object):
    '''Normalizer for irregular word forms.

    Lookups are answered by a WordList (defined elsewhere in the project)
    built from the irregular verb and noun files under WORDS_DIR.
    '''

    def __init__(self):
        # Verbs first, then nouns -- the same argument order WordList
        # has always been given here.
        self.words = WordList(WORDS_DIR + "/irregular/verb.txt",
                              WORDS_DIR + "/irregular/noun.txt")

    def normalize(self, word):
        '''
        Look word up among the irregular forms.

        Returns the stored normalized form when there is an entry for
        word; otherwise hands word back unchanged.
        '''
        lookup = self.words.get
        # Using word itself as the default makes a miss a no-op.
        return lookup(word, word)
def __init__(self, s_rules=True, ed_rules=True, ing_rules=True, irr_rules=True):
    '''Set up the lemmatizer; every rule is enabled by default.

    s_rules: normalize words ending in "s"
    ed_rules: normalize words ending in "ed"
    ing_rules: normalize words ending in "ing"
    irr_rules: normalize irregular words
    '''
    self.spell = WordList()

    # Pair each suffix with its switch and keep the enabled ones, in the
    # fixed order "s", "ed", "ing".
    switches = (("s", s_rules), ("ed", ed_rules), ("ing", ing_rules))
    enabled = [suffix for suffix, active in switches if active]

    # End-anchored alternation over the enabled suffixes.
    pattern = r"(?:" + "|".join(enabled) + r")$"
    self.inflection = re.compile(pattern)

    # False (not None) marks "irregular handling switched off".
    self.irregular = IrregularForm() if irr_rules else False
def __init__(self):
    '''Load the irregular verb and noun files into a single WordList.'''
    # Both files live under WORDS_DIR; verbs are passed first, nouns second.
    irregular_verbs = WORDS_DIR + "/irregular/verb.txt"
    irregular_nouns = WORDS_DIR + "/irregular/noun.txt"
    sources = (irregular_verbs, irregular_nouns)
    self.words = WordList(*sources)
class Lemmatizer(object):
    '''Custom lemmatizer for normalizing inflected and irregular word forms.

    A candidate base form is only accepted when self.spell.check() reports
    it as a known word; a word that cannot be reduced to a known base form
    is returned unchanged.
    '''

    # Inflection rule exceptions: words that end in a suffix but must be
    # left untouched by the corresponding normalize_* method.
    exceptions = {"s": ["bus", "yes", "does", "his", "hers", "mess", "hiss",
                        "woods", "stairs", "tights", "pants", "shorts",
                        "jeans"],
                  "ed": ["fled", "wicked", "tired", "reed", "weed", "feed",
                         "need", "seed"],
                  "ing": ["morning", "evening", "earring", "nothing"]}

    # Patterns
    vowel = r'[aeiou]'
    nonvowel = r'[^aeiou]'
    consonant = r'[bcdfghjklmnpqrstvwxz]'

    # Regex
    # pattern for geminated words such as "running":
    gem = re.compile(r'(?P<gem>[pbmtdnrgsvz])(?P=gem)(?P<suffix>ed|ing)')

    def __init__(self, s_rules=True, ed_rules=True, ing_rules=True,
                 irr_rules=True):
        '''Defaults to using all rules.

        s_rules: normalize words ending in "s"
        ed_rules: normalize words ending in "ed"
        ing_rules: normalize words ending in "ing"
        irr_rules: normalize irregular words
        '''
        self.spell = WordList()
        suffixes = []
        if s_rules:
            suffixes.append("s")
        if ed_rules:
            suffixes.append("ed")
        if ing_rules:
            suffixes.append("ing")
        # End-anchored alternation over the enabled suffixes. With no
        # suffix enabled this only matches the empty string, which
        # is_inflected() reports as "" (falsy), so nothing is normalized.
        self.inflection = re.compile(r"(?:" + "|".join(suffixes) + r")$")
        # False means irregular-form handling is switched off.
        self.irregular = False
        if irr_rules:
            self.irregular = IrregularForm()

    def __call__(self, word):
        '''Normalize word based on selected rules.'''
        return self.normalize(word)

    def normalize(self, word):
        '''Normalize word based on selected rules.

        Irregular forms are tried first (when enabled); otherwise the word
        is dispatched to normalize_s / normalize_ed / normalize_ing
        according to its ending. For a word containing "+", only the part
        after the last "+" is normalized and the prefix is kept verbatim.
        '''
        if self.irregular:
            result = self.irregular.normalize(word)
            if result != word:
                return result
        inflection = self.is_inflected(word)
        if not inflection:
            return word
        # Look the handler up once instead of once per branch.
        normalize = getattr(self, "normalize_" + inflection)
        if "+" in word:
            (base, end) = word.rsplit("+", 1)
            return base + "+" + normalize(end)
        return normalize(word)

    def normalize_s(self, word):
        '''Return the base form of a word ending in "s", or word itself.

        Tries, in order: dropping "s", "ies" -> "y", "ves" -> "f" / "fe",
        and dropping "es". The first candidate accepted by the spell
        checker wins.
        '''
        if len(word) < 3 or word in self.exceptions["s"]:
            return word
        stem = self.strip_suffix(word, "s")
        if self.spell.check(stem):
            return stem
        if word.endswith("ies"):
            stem = word[:-3]
            if self.spell.check(stem + "y"):
                return stem + "y"
        if word.endswith("ves"):
            if len(word) < 5:
                return word
            stem = word[:-3]
            # "leaves" -> "leaf", and also "knives" -> "knife".
            for ending in ("f", "fe"):
                if self.spell.check(stem + ending):
                    return stem + ending
        if word.endswith("es"):
            stem = word[:-2]
            if self.spell.check(stem):
                return stem
        return word

    def normalize_ed(self, word):
        '''Return the base form of a word ending in "ed", or word itself.'''
        if len(word) < 4 or word in self.exceptions["ed"]:
            return word
        stem = self.strip_suffix(word, "ed")
        # "-ied" normally maps to "-y" ("carried" -> "carry"); the listed
        # stems ("tied", "lied", "died") go through the generic path so
        # they can resolve to stem + "e".
        if word.endswith("ied") and stem not in ("ti", "li", "di"):
            base = word[:-3] + 'y'
            return base if self.spell.check(base) else word
        blocked = ("us", "ti", "fe", "bl", "pleas", "scar")
        return self._lookup_base(word, stem, word[:-2], blocked)

    def normalize_ing(self, word):
        '''Return the base form of a word ending in "ing", or word itself.'''
        if len(word) < 5 or word in self.exceptions["ing"]:
            return word
        stem = self.strip_suffix(word, "ing")
        blocked = ("us", "com", "div", "writ")
        return self._lookup_base(word, stem, word[:-3], blocked)

    def _lookup_base(self, word, stem, plain, blocked):
        '''Pick the first spell-checked base form for an "ed"/"ing" word.

        stem: result of strip_suffix() (possibly de-geminated)
        plain: word with only the suffix removed
        blocked: stems never returned bare; for these only stem + "e" is
            considered (e.g. "us" -> "use")
        '''
        if self.spell.check(stem) and stem not in blocked:
            return stem
        if self.spell.check(stem + 'e'):
            return stem + 'e'
        if plain != stem:
            # The double consonant may belong to the base word itself
            # ("passing" -> "pass", "silhouetted" -> "silhouette"), so
            # retry without undoing the gemination.
            for candidate in (plain, plain + 'e'):
                if self.spell.check(candidate):
                    return candidate
        return word

    def is_inflected(self, word):
        '''Return inflectional morpheme if word is inflected or plural.

        Returns None when the word does not end in an enabled suffix.
        '''
        match = self.inflection.search(word)
        if match:
            return match.group()

    def is_geminated(self, word):
        '''Test to see if a word is geminated.

        Returns the matched text (doubled consonant plus "ed"/"ing") found
        anywhere in word, or None when there is no such sequence.
        '''
        match = self.gem.search(word)
        if match:
            return match.group()

    def strip_suffix(self, word, suffix):
        '''Try to intelligently strip off suffix.

        Removes suffix from the end of word. When the suffix is directly
        preceded by a doubled consonant ("running" + "ing"), one of the two
        consonants is removed as well. The doubling must sit immediately
        before the suffix being stripped, so "weddings" + "s" yields
        "wedding" rather than "weddin".
        '''
        n = len(suffix)
        # A geminated ending is exactly: consonant, same consonant, suffix.
        start = len(word) - n - 2
        if start >= 0:
            match = self.gem.match(word, start)
            if match and match.group("suffix") == suffix:
                return word[:-(n + 1)]
        return word[:-n]
def test_add():
    '''Testing add method'''
    testwords = WordList(WORDS_DIR + "/test.txt")
    # Precondition: the word must be absent, otherwise the check after
    # add() would prove nothing.
    assert 'baz' not in testwords, '"baz" is already in WordList'
    testwords.add('baz')  # add "baz" to active word list
    # An assert message is only shown on failure, so it has to describe
    # the failure, not the success.
    assert 'baz' in testwords, '"baz" was not added to WordList'