Ejemplo n.º 1
0
def english_window(words, wsize=2):
    """Yield lower-cased, normalized phrases from a sliding window over *words*.

    Every word first has all characters other than ASCII letters, digits,
    apostrophes, spaces and hyphens removed; words that end up empty are
    dropped.  Each run of ``wsize`` consecutive remaining words is joined
    with single spaces and passed to ``en_nl.normalize``.  The normalized
    phrase is yielded (lower-cased) when it is non-empty and either

    * the first character of every whitespace-separated piece of the
      phrase is unchanged by ``upper()`` while the phrase as a whole is
      not entirely upper-case, or
    * the normalized phrase is contained in ``concepts``.

    Args:
        words: iterable of word strings.
        wsize: number of consecutive words per window (default 2).

    Yields:
        The lower-cased normalized phrase for each qualifying window.
    """
    stripped = (re.sub(r"[^A-Za-z0-9' -]", '', word) for word in words)
    # Build a real list: the loop below needs len() and slicing, which the
    # lazy object returned by filter() on Python 3 does not support.
    words = [word for word in stripped if word]
    # range() instead of xrange() so the function runs on Python 2 and 3.
    for x in range(len(words) - wsize + 1):
        pair = ' '.join(words[x:x + wsize])
        caps = ''.join(w[0] for w in pair.split())
        norm = en_nl.normalize(pair)
        if not norm:
            continue
        # Initials all survive upper() but the phrase itself is not all-caps.
        capitalized = ('@' not in caps and caps.upper() == caps
                       and pair.upper() != pair)
        if capitalized or norm in concepts:
            yield norm.lower()
Ejemplo n.º 2
0
def english_window(words, wsize=2):
    """Slide a ``wsize``-word window over *words* and yield notable phrases.

    Characters other than ASCII letters, digits, apostrophes, spaces and
    hyphens are stripped from each word, and words left empty are
    discarded.  For each window of ``wsize`` consecutive words, the words
    are joined with a space and normalized via ``en_nl.normalize``.  A
    window is reported when its normalized form is non-empty and either the
    phrase has initials that are all unchanged by ``upper()`` without the
    whole phrase being upper-case, or the normalized form is a member of
    ``concepts``.

    Args:
        words: iterable of word strings.
        wsize: window length in words (default 2).

    Yields:
        ``norm.lower()`` for every window that qualifies.
    """
    # A list comprehension (rather than filter()) guarantees a sequence on
    # both Python 2 and Python 3; len() and slicing below depend on that.
    words = [kept for kept in
             (re.sub(r"[^A-Za-z0-9' -]", '', raw) for raw in words)
             if kept]
    # range() exists on both Python 2 and 3; xrange() is Python 2 only.
    for x in range(len(words) - wsize + 1):
        pair = ' '.join(words[x:x + wsize])
        caps = ''.join(w[0] for w in pair.split())
        norm = en_nl.normalize(pair)
        if norm and (('@' not in caps and caps.upper() == caps
                      and pair.upper() != pair) or norm in concepts):
            yield norm.lower()
Ejemplo n.º 3
0
def clean_twitter(phrase):
    """Yield cleaned tokens and windowed phrases extracted from *phrase*.

    Non-ASCII characters are replaced by spaces and any character repeated
    three or more times in a row is collapsed to two.  If ``is_bad_word``
    flags the lower-cased result, nothing is yielded.  Otherwise the text is
    tokenized with ``en_nl.tokenize`` and each token, followed by each
    phrase from ``english_window``, is handled as follows:

    * items starting with ``#``, ``@`` or ``http:`` are yielded unchanged;
    * blank items, the literal ``'rt'`` and stopwords are skipped;
    * everything else is normalized, stripped of leading/trailing hyphens
      and yielded if anything non-blank remains.
    """
    ascii_text = re.sub(r'[^\x00-\x7f]', ' ', phrase)
    phrase = re.sub(r'(.)\1{2,}', r'\1\1', ascii_text)
    if not is_bad_word(phrase.lower()):
        tokens = en_nl.tokenize(phrase).split()
        candidates = itertools.chain(tokens, english_window(tokens))
        for item in candidates:
            if item.startswith(('#', '@', 'http:')):
                # Hashtags, mentions and links pass through untouched.
                yield item
                continue
            if not item.strip() or item == 'rt' or en_nl.is_stopword(item):
                continue
            normalized = en_nl.normalize(item).strip('-')
            if normalized.strip():
                yield normalized
Ejemplo n.º 4
0
def clean_twitter(phrase):
    """Generate the usable terms of a tweet-like string.

    Processing steps, in order:

    1. Replace every non-ASCII character with a space.
    2. Shorten any run of three or more identical characters to two.
    3. Stop without yielding anything when ``is_bad_word`` accepts the
       lower-cased text.
    4. Tokenize with ``en_nl.tokenize`` and walk the tokens followed by the
       phrases produced by ``english_window`` for those tokens.

    Terms beginning with ``#``, ``@`` or ``http:`` are yielded verbatim.
    Other terms are skipped when blank, equal to ``'rt'`` or reported as
    stopwords; the rest are normalized, have surrounding hyphens stripped,
    and are yielded when still non-blank.
    """
    passthrough_prefixes = ('#', '@', 'http:')
    without_unicode = re.sub(r'[^\x00-\x7f]', ' ', phrase)
    phrase = re.sub(r'(.)\1{2,}', r'\1\1', without_unicode)
    if is_bad_word(phrase.lower()):
        return
    words = en_nl.tokenize(phrase).split()
    for term in itertools.chain(words, english_window(words)):
        if term.startswith(passthrough_prefixes):
            yield term
            continue
        keep = term.strip() and term != 'rt' and not en_nl.is_stopword(term)
        if not keep:
            continue
        term = en_nl.normalize(term).strip('-')
        if term.strip():
            yield term