Exemple #1
0
class IrregularForm(object):
    '''Lookup helper that maps irregular word forms to a normalized form.'''

    def __init__(self):
        # Verb list path first, noun list path second; both are handed to
        # WordList in that order.
        sources = [WORDS_DIR + tail for tail in ("/irregular/verb.txt",
                                                 "/irregular/noun.txt")]
        self.words = WordList(*sources)

    def normalize(self, word):
        '''
        Look word up in the irregular word list.

        Returns the stored form when the list has an entry for word;
        otherwise word itself comes back unchanged.

        '''
        lookup = self.words.get
        return lookup(word, word)
Exemple #2
0
    def __init__(self, s_rules=True, ed_rules=True, ing_rules=True,
                 irr_rules=True):
        '''Set up the lemmatizer; every rule is enabled unless switched off.

        s_rules: normalize words ending in "s"
        ed_rules: normalize words ending in "ed"
        ing_rules: normalize words ending in "ing"
        irr_rules: normalize irregular words

        '''
        self.spell = WordList()
        # Keep only the suffixes whose rule flag is truthy, in the fixed
        # order "s", "ed", "ing".
        switches = (("s", s_rules), ("ed", ed_rules), ("ing", ing_rules))
        enabled = [suffix for suffix, wanted in switches if wanted]
        # One end-anchored alternation over the enabled suffixes.
        self.inflection = re.compile(r"(?:" + "|".join(enabled) + r")$")
        # False stands for "irregular-form lookup disabled".
        self.irregular = IrregularForm() if irr_rules else False
Exemple #3
0
 def __init__(self):
     '''Build self.words from the irregular verb and noun files.'''
     # Verb path first, noun path second, passed to WordList in that order.
     irregular_files = [WORDS_DIR + tail for tail in ("/irregular/verb.txt",
                                                      "/irregular/noun.txt")]
     self.words = WordList(*irregular_files)
Exemple #4
0
class Lemmatizer(object):
    '''Custom lemmatizer for normalizing inflected and irregular word forms.

    Instances are callable: lemmatizer(word) is the same as
    lemmatizer.normalize(word).

    '''

    # Inflection rule exceptions: words ending in the given suffix that the
    # matching normalize_* method returns unchanged.
    exceptions = {"s": ["bus", "yes", "does", "his", "hers", "mess", "hiss",
                        "woods", "stairs", "tights", "pants", "shorts", "jeans"],
                  "ed": ["fled", "wicked", "tired",
                         "reed", "weed", "feed", "need", "seed"],
                  "ing": ["morning", "evening", "earring", "nothing"]}

    # Patterns
    vowel = r'[aeiou]'
    nonvowel = r'[^aeiou]'
    consonant = r'[bcdfghjklmnpqrstvwxz]'

    # Regex
    # pattern for geminated words such as "running":
    # a doubled consonant immediately followed by "ed" or "ing".  The
    # pattern is not anchored, so it can match anywhere inside a word.
    gem = re.compile(r'(?P<gem>[pbmtdnrgsvz])(?P=gem)(?P<suffix>ed|ing)')

    def __init__(self, s_rules=True, ed_rules=True, ing_rules=True,
                                                    irr_rules=True):
        '''Defaults to using all rules.

        s_rules: normalize words ending in "s"
        ed_rules: normalize words ending in "ed"
        ing_rules: normalize words ending in "ing"
        irr_rules: normalize irregular words

        '''
        # Object whose check() method decides whether a candidate stem is
        # accepted by the normalize_* methods.
        self.spell = WordList()
        suffixes = []
        if s_rules: suffixes.append("s")
        if ed_rules: suffixes.append("ed")
        if ing_rules: suffixes.append("ing")
        # End-anchored alternation over the enabled suffixes.  With no
        # suffix enabled it only matches the empty string, which
        # normalize() treats as "not inflected".
        self.inflection = re.compile(r"(?:" + "|".join(suffixes) + r")$")
        # False means irregular-form lookup is disabled.
        self.irregular = False
        if irr_rules:
            self.irregular = IrregularForm()

    def __call__(self, word):
        '''Normalize word based on selected rules.'''
        return self.normalize(word)

    def normalize(self, word):
        '''Normalize word based on selected rules.

        The irregular-form lookup (when enabled) is tried first and wins if
        it changes the word.  Otherwise the word is dispatched on its
        suffix to normalize_s / normalize_ed / normalize_ing.  For a word
        containing "+", only the part after the last "+" is normalized.
        Words matching no enabled suffix are returned unchanged.

        '''
        if self.irregular:
            result = self.irregular.normalize(word)
            if result != word:
                return result
        inflection = self.is_inflected(word)
        if not inflection:
            return word
        # The matched suffix text selects the handler method by name.
        normalize = getattr(self, "normalize_" + inflection)
        if "+" in word:
            (base, end) = word.rsplit("+", 1)
            return base + "+" + normalize(end)
        return normalize(word)

    def normalize_s(self, word):
        '''Return the base form of a word ending in "s".

        Words shorter than 3 characters or listed in exceptions["s"] are
        returned unchanged.  Candidates are tried in this order and the
        first one accepted by self.spell.check() is returned: the stem
        from strip_suffix, "ies" -> "y", "ves" -> "f" (words shorter than
        5 characters are returned unchanged at this step), and the word
        without "es".  If nothing is accepted the word is returned as is.

        '''
        if len(word) < 3 or word in self.exceptions["s"]:
            return word
        stem = self.strip_suffix(word, "s")
        if self.spell.check(stem):
            return stem
        if word.endswith("ies"):
            stem = word[:-3]
            if self.spell.check(stem + "y"):
                return stem + "y"
        if word.endswith("ves"):
            if len(word) < 5:
                return word
            stem = word[:-3]
            if self.spell.check(stem + "f"):
                return stem + "f"
        if word.endswith("es"):
            stem = word[:-2]
            if self.spell.check(stem):
                return stem
        return word

    def normalize_ed(self, word):
        '''Return the base form of a word ending in "ed".

        Words shorter than 4 characters or listed in exceptions["ed"] are
        returned unchanged.  A word ending in "ied" is mapped to its
        "...y" form when self.spell.check() accepts it and is otherwise
        returned unchanged.  Any other word yields the stem from
        strip_suffix, or that stem plus "e", whichever is accepted first;
        if neither is accepted the word is returned unchanged.

        '''
        if len(word) < 4 or word in self.exceptions["ed"]:
            return word
        stem = self.strip_suffix(word, "ed")
        # Stems "ti", "li", "di" skip the "ied" -> "y" rule and fall
        # through to the stem / stem + "e" checks below (presumably so
        # "tied" can become "tie" rather than being looked up as "ty").
        if word.endswith("ied") and stem not in ["ti", "li", "di"]:
            stem = word[:-3]
            if self.spell.check(stem + 'y'):
                return stem + 'y'
            else:
                return word
        # The listed stems are never returned bare even when the spell
        # check accepts them; only their stem + "e" form is considered.
        if self.spell.check(stem) and stem not in ["us", "ti", "fe", "bl",
                                                     "pleas", "scar"]:
            return stem
        elif self.spell.check(stem + 'e'):
            return stem + 'e'
        else:
            return word

    def normalize_ing(self, word):
        '''Return the base form of a word ending in "ing".

        Words shorter than 5 characters or listed in exceptions["ing"] are
        returned unchanged.  Otherwise the stem from strip_suffix, or that
        stem plus "e", is returned, whichever self.spell.check() accepts
        first; if neither is accepted the word is returned unchanged.

        '''
        if len(word) < 5 or word in self.exceptions["ing"]:
            return word
        stem = self.strip_suffix(word, "ing")
        # The listed stems are never returned bare even when the spell
        # check accepts them; only their stem + "e" form is considered.
        if self.spell.check(stem) and stem not in ["us", "com", "div", "writ"]:
            return stem
        elif self.spell.check(stem + 'e'):
            return stem + 'e'
        else:
            return word

    def is_inflected(self, word):
        '''Return inflectional morpheme if word is inflected or plural.

        The result is the matched suffix text, or None when the word does
        not end in an enabled suffix.

        '''
        match = self.inflection.search(word)
        if match:
            return match.group()

    def is_geminated(self, word):
        '''Test to see if a word is geminated.

        Returns the matched text (doubled consonant plus "ed"/"ing"), or
        None.  The match may occur anywhere in the word, not only at its
        end.

        '''
        match = self.gem.search(word)
        if match:
            return match.group()

    def strip_suffix(self, word, suffix):
        '''Try to intelligently strip off suffix.

        Removes the last len(suffix) characters of word.  When a doubled
        consonant sits directly in front of that suffix at the end of the
        word ("running" / "ing"), one of the doubled letters is removed as
        well ("run").

        '''
        n = len(suffix)
        # Inspect only the end of the word: two consonant slots plus the
        # suffix.  A doubled consonant elsewhere must not count, otherwise
        # stripping "s" from "weddings" would yield "weddin" because of
        # the inner "dding".
        tail = word[-(n + 2):]
        match = self.gem.match(tail)
        if (match and match.end() == len(tail)
                and match.group("suffix") == suffix):
            return word[:-(n+1)]
        return word[:-n]
Exemple #5
0
def test_add():
    '''Testing add method'''
    testwords = WordList(WORDS_DIR + "/test.txt")
    # Precondition: the word must be absent, otherwise the membership
    # check after add() proves nothing.
    assert 'baz' not in testwords, '"baz" already in WordList before add'
    testwords.add('baz')    # add "baz" to active word list
    # The message is shown only when the assertion fails, so it has to
    # describe the failure rather than the expected outcome.
    assert 'baz' in testwords, '"baz" was not added to WordList'