# tokenizer.py — 99 lines (77 loc) · 2.35 KB
# NOTE: the upstream repository was archived by the owner on Sep 23, 2022
# and is now read-only.
import re
import jieba
# URL-like substrings (a scheme followed by path characters); these are
# stripped before tokenization so they do not pollute the token stream.
# NOTE: the original `ur''` literals are Python-2-only syntax; the `u''`
# forms below produce byte-identical pattern strings on Python 2 and 3.
URL_LIKE_RE = re.compile(u'\\b[a-z]+:/*[a-z0-9\\-\\._~:/\\?#@!$&\\+=]+', flags=re.IGNORECASE)
# Character ranges covering kana, CJK enclosed/compatibility forms, and the
# unified ideograph blocks (extension A, main block, compatibility ideographs).
CJK_CHAR_RANGE = u'\u3040-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
# A CJK character directly followed by (resp. preceded by) a non-CJK,
# non-space character — the two boundary cases needing a space inserted.
CJK_LEFT_RE = re.compile(u'([' + CJK_CHAR_RANGE + u'])([^' + CJK_CHAR_RANGE + u'\\s])')
CJK_RIGHT_RE = re.compile(u'([^' + CJK_CHAR_RANGE + u'\\s])([' + CJK_CHAR_RANGE + u'])')
# One maximal run of CJK characters (a "CJK slug").
CJK_SLUG_RE = re.compile(u'([' + CJK_CHAR_RANGE + u']+)')
# An ASCII identifier-like token, optionally prefixed by '#' or '@';
# only the bare slug (without the sigil) is captured.
ASCII_SLUG = u'[a-z_][a-z0-9\\-_\\.]*'
ASCII_SLUG_RE = re.compile(u'[#@]?(' + ASCII_SLUG + u')', flags=re.IGNORECASE)
# Populated by initialize(); None until then.
CONJUNCTIONS = None
def initialize():
    """Load conjunction data and initialize the jieba CJK tokenizer.

    Must be called once before tokenize(): it populates the module-level
    CONJUNCTIONS list from vendor/moedict.dict and points jieba at the
    vendored dictionaries.
    """
    # Load conjunction data
    global CONJUNCTIONS
    CONJUNCTIONS = []
    from codecs import open
    with open('vendor/moedict.dict', 'r', encoding='utf8') as data:
        for entry in data:
            fields = entry.split()
            # Skip blank/whitespace-only lines; indexing [0] on an empty
            # split() result would raise IndexError.
            if fields:
                CONJUNCTIONS.append(fields[0])
    # Load CJK parsing library
    jieba.set_dictionary('vendor/jieba_tc.dict')
    jieba.load_userdict('vendor/chewing.dict')
    jieba.initialize()
def preprocess_cjk(text):
    """Insert a space at every CJK/non-CJK boundary.

    Ensures CJK runs are whitespace-separated from adjacent non-CJK,
    non-space characters on both sides.
    """
    spaced = r'\1 \2'
    for boundary_re in (CJK_LEFT_RE, CJK_RIGHT_RE):
        text = boundary_re.sub(spaced, text)
    return text
def preprocess_url(text):
    """Remove URL-like substrings entirely from *text*."""
    return URL_LIKE_RE.sub('', text)
def preprocess_ascii(text):
    """Pad each ASCII slug with surrounding spaces, dropping any '#'/'@' sigil."""
    return ASCII_SLUG_RE.sub(r' \1 ', text)
def preprocess(text):
    """Run the full preprocessing pipeline over *text*.

    Decodes byte strings as UTF-8, then strips URLs, spaces out CJK
    boundaries, and pads ASCII slugs, in that order.
    """
    # Ensure unicode string: decoding an already-unicode value raises
    # TypeError, in which case the input passes through untouched.
    try:
        text = unicode(text, 'utf8')
    except TypeError:
        pass
    for stage in (preprocess_url, preprocess_cjk, preprocess_ascii):
        text = stage(text)
    return text
def tokenize(text):
    """Split *text* into a flat list of tokens.

    ASCII slugs are taken directly from the preprocessed text; each CJK run
    is segmented with jieba, and runs of adjacent single-character pieces
    that are not known conjunctions are additionally re-joined into extra
    candidate tokens.

    NOTE(review): initialize() must have been called first — otherwise
    CONJUNCTIONS is still None and the membership test below raises
    TypeError.
    """
    tokens = []
    text = preprocess(text)
    tokens += ASCII_SLUG_RE.findall(text)  # ASCII tokens are already usable
    for unit in CJK_SLUG_RE.findall(text):  # CJK tokens need extraction
        # Search engine mode. Might return ambiguous result
        unit_tokens = list(jieba.cut_for_search(unit))
        # Make better word guessing by joining non-conjunction words:
        # scan maximal runs of single-character, non-conjunction tokens
        # and append their concatenation as one extra candidate token.
        i = 0
        # length is fixed before any append, so joined tokens added below
        # are never themselves rescanned.
        length = len(unit_tokens)
        while i < length:
            j = i
            buf = ''
            while j < length:
                token = unit_tokens[j]
                # A conjunction or an already multi-character word ends the run.
                if token in CONJUNCTIONS or len(token) > 1:
                    break
                else:
                    buf += token
                    j += 1
            # Keep the joined guess only if it is a real join (2+ chars)
            # and not already among jieba's own output.
            if len(buf) > 1 and buf not in unit_tokens:
                unit_tokens.append(buf)
            # Resume past the token that terminated the run.
            i = j + 1
        tokens.extend(unit_tokens)
    return tokens
if __name__ == '__main__':
    initialize()
    from sys import stdin
    # Tokenize stdin line by line until EOF or Ctrl-C.
    while True:
        try:
            line = stdin.readline()
            # readline() returns '' at EOF; without this check the loop
            # would spin forever printing empty output.
            if not line:
                break
            # Single-argument parenthesized print behaves identically on
            # Python 2 and 3.
            print('|'.join(tokenize(line)))
        except KeyboardInterrupt:
            break