def deaccent(text):
    """
    Remove accentuation from the given string.

    Input `text` is either a unicode string or a utf8-encoded bytestring.
    Returns the input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    # Original code checked `isinstance(text, unicode)`, which is a NameError
    # on Python 3; `str` is the unicode type there. Bytes input is still
    # decoded as utf8 with strict (default) error handling.
    if not isinstance(text, str):
        text = text.decode('utf8')
    # NFD splits each accented character into base char + combining mark(s);
    # dropping category 'Mn' (nonspacing marks) removes the accents, and NFC
    # recomposes whatever remains into canonical composed form.
    norm = unicodedata.normalize("NFD", text)
    result = ''.join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)
def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False):
    """
    Extract lemmatized tokens ("are, is, being" -> "be") from `content`,
    using the English lemmatizer from the optional `pattern` package — a
    context-aware alternative to plain stemming.

    Only tokens whose POS tag matches `allowed_tags` are kept (nouns, verbs,
    adjectives and adverbs by default); everything else is discarded.

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']

    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']

    """
    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # pattern's own tokenizer gets confused by non-letters, emitting junk
    # like '==relate/VBN' or '**/NN' — run our tokenizer first as a cleanup.
    # FIXME this throws away all fancy parsing cues, including sentence
    # structure, abbreviations etc.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    tagged = parse(content, lemmata=True, collapse=False)
    lemmas = []
    for sentence in tagged:
        for token, tag, _, _, lemma in sentence:
            # skip implausibly short/long lemmas and pattern's '_' artifacts
            if len(lemma) < 2 or len(lemma) > 15 or lemma.startswith('_'):
                continue
            if not allowed_tags.match(tag):
                continue
            lemma += "/" + tag[:2]
            lemmas.append(lemma.encode('utf8'))
    return lemmas