Beispiel #1
0
def replace(text, repls):
    """Run ``text`` through the replacements described by ``repls``.

    ``repls`` is first handed to ``replacer.prepare``; the prepared result
    is then passed, together with ``text``, to ``replacer.apply``.  Whatever
    ``replacer.apply`` returns is returned unchanged.
    """
    # Prepare first, then apply -- same call order as a manual two-step use.
    prepared = replacer.prepare(repls)
    result = replacer.apply(prepared, text)
    return result
Beispiel #2
0
'''

import re
import sys
from denis.common import util
from denis.common.replacer import replacer

# Punctuation strings fed to replacer.prepare below.  The entries are listed
# in a fixed order; do not reorder them without checking replacer.prepare.
_to_remove = [
    '.', ',', '!', '?', ':', ';',   # sentence punctuation
    '>', '<',                       # angle brackets
    '"', "'",                       # quotes
    '(', ')', '{', '}', '[', ']',   # brackets
    '\\', '--', '`',                # backslash, double dash, backtick
]

# Everything in _to_remove plus the single hyphen, combined via util.flatten.
_to_substitute = util.flatten([_to_remove, ['-']])

# Prepared patterns: one built with onlyAtEnds=True from _to_remove, one
# built with onlyAtEnds=False from _to_substitute.
# NOTE(review): presumably onlyAtEnds restricts matching to the edges of a
# token -- confirm against denis.common.replacer.
_removal_pattern = replacer.prepare(
    _to_remove,
    onlyAtEnds=True,
)
_substitution_pattern = replacer.prepare(
    _to_substitute,
    onlyAtEnds=False,
)

# Regex -> placeholder table for numeric tokens.  Both patterns are anchored
# with ^...$ and accept one or more digits with an optional ".digits" part
# (e.g. "12" or "12.50"); the second additionally requires a leading "$".
_digit_normalizers = {
    # plain number
    r'^[0-9]{1,}(\.[0-9]{1,}){0,1}$': '[DIGITS]',
    # dollar amount
    r'^\$[0-9]{1,}(\.[0-9]{1,}){0,1}$': '[MONEY]',
}

def tokenize(line, clean=True, tolower=True, splitwords=False):
    tokens = line.strip().split()
    if clean:
        cleanTokens = []
        for token in tokens:
            token = token.strip()
            # only force UTF-8 encoding if still in Python 2
            if sys.version[0] == '2':