def _tokenize(text):
    """Tokenize *text* with a Treebank tokenizer that does NOT split on ``$``.

    The rule list below is NLTK's stock ``TreebankWordTokenizer.PUNCTUATION``
    with ``$`` removed from the ``[;@#%&]`` character class, so currency
    amounts like ``$5`` stay attached to the dollar sign.

    The patched tokenizer is built lazily on first call and cached on the
    function object: construction must not happen at import time because the
    ``nltk``/``re`` imports appear later in this module, and caching avoids
    re-building the rule list (and re-compiling its regexes) on every call.

    :param text: the string to tokenize
    :return: list of token strings
    """
    tokenizer = getattr(_tokenize, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = TreebankWordTokenizer()
        # Instance attribute shadows the class-level rule list; note the
        # character class [;@#%&] deliberately omits '$'.
        tokenizer.PUNCTUATION = [
            (re.compile(r'([:,])([^\d])'), r' \1 \2'),
            (re.compile(r'([:,])$'), r' \1 '),
            (re.compile(r'\.\.\.'), r' ... '),
            (re.compile(r'[;@#%&]'), r' \g<0> '),
            (
                re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
                r'\1 \2\3 ',
            ),
            (re.compile(r'[?!]'), r' \g<0> '),
            (re.compile(r"([^'])' "), r"\1 ' "),
        ]
        _tokenize._tokenizer = tokenizer
    return tokenizer.tokenize(text)
from nltk.tokenize import TreebankWordTokenizer
import re

# Module-level tokenizer with a patched punctuation rule list.
# Unlike _tokenize's variant, this one keeps '$' as a split character
# ([;@#$%&]) and additionally splits a trailing backslash/slash/colon/comma
# via ([\\/:,])$ — otherwise it mirrors NLTK's stock rules.
t = TreebankWordTokenizer()
t.PUNCTUATION = [
    (re.compile(r'([:,])([^\d])'), r' \1 \2'),
    (re.compile(r'([\\/:,])$'), r' \1 '),
    (re.compile(r'\.\.\.'), r' ... '),
    (re.compile(r'[;@#$%&]'), r' \g<0> '),
    (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
    (re.compile(r'[?!]'), r' \g<0> '),
    (re.compile(r"([^'])' "), r"\1 ' "),
]

# token replacements, to attempt some kind of normalisation
# Maps PTB-style bracket escapes (-lrb- etc.) back to literal brackets and
# folds typographic/backtick quote variants onto plain ASCII quotes.
# NOTE(review): this dict literal continues beyond this chunk — intentionally
# left open here.
replacements = {
    "-lrb-": "(",
    "-rrb-": ")",
    "-lsb-": "[",
    "-rsb-": "]",
    "-lcb-": "{",
    "-rcb-": "}",
    "``": "\"",
    "“": "\"",
    "''": "\"",
    "”": "\"",
    "`": "'",
    "‘": "'",
    "’": "'",
    "---": "--",