Example #1
import re

def english_window(words, wsize=2):
    # en_nl (English NLP helpers) and concepts (a set of known concepts)
    # are module-level dependencies defined elsewhere in the project.
    words = [w for w in (re.sub(r"[^A-Za-z0-9' -]", '', w) for w in words) if w]
    for x in range(len(words) - wsize + 1):
        pair = ' '.join(words[x:x + wsize])
        caps = ''.join(w[0] for w in pair.split())  # first letter of each word
        norm = en_nl.normalize(pair)
        # Keep the window if it looks like a TitleCase phrase (capitalized
        # initials, but not all-caps shouting) or is already a known concept.
        if norm and (('@' not in caps and caps.upper() == caps
                      and pair.upper() != pair) or norm in concepts):
            yield norm.lower()
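
A minimal way to exercise english_window in isolation. The real en_nl helper and concepts set come from the surrounding project, so the stand-ins below are assumptions, not the project's API:

class _StubNL:
    # Hypothetical stand-in for the project's English NLP helper.
    def normalize(self, phrase):
        return phrase.lower()

en_nl = _StubNL()
concepts = {'machine learning'}

tokens = 'I love Machine Learning at MIT'.split()
print(list(english_window(tokens)))
# ['machine learning']  -- only 'Machine Learning' passes the TitleCase test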
Example #2
import itertools
import re

def clean_twitter(phrase):
    # is_bad_word and en_nl are module-level dependencies defined elsewhere.
    # Blank out non-ASCII characters, then collapse runs of 3+ repeated
    # characters down to two (e.g. 'soooo' -> 'soo').
    phrase = re.sub(r'(.)\1{2,}', r'\1\1',
                    re.sub(r'[^\x00-\x7f]', ' ', phrase))
    if is_bad_word(phrase.lower()):
        return
    parts = en_nl.tokenize(phrase).split()
    # Yield single tokens, then two-word windows from english_window().
    for part in itertools.chain(parts, english_window(parts)):
        if part.startswith(('#', '@', 'http:')):
            yield part  # keep hashtags, mentions, and URLs verbatim
        elif part.strip() and part != 'rt' and not en_nl.is_stopword(part):
            part = en_nl.normalize(part).strip('-')
            if part.strip():
                yield part
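
To run clean_twitter standalone, stub its external dependencies in the same spirit. is_bad_word and en_nl are project-specific, so the stand-ins below (including the simplistic stopword list and a tokenizer that only lowercases) are assumptions for demonstration:

def is_bad_word(phrase):
    return False  # hypothetical stub: skip profanity filtering

class _StubNL:
    # Hypothetical stand-in; a real tokenizer would also split punctuation.
    def tokenize(self, phrase):
        return phrase.lower()
    def is_stopword(self, word):
        return word in {'a', 'an', 'the', 'i', 'at'}
    def normalize(self, phrase):
        return phrase.lower()

en_nl = _StubNL()
concepts = set()

print(list(clean_twitter('RT @bob: I loooove #python http://example.com')))
# ['@bob:', 'loove', '#python', 'http://example.com']
# 'rt' and the stopword 'i' are dropped; repeated letters are collapsed.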