forked from zhujinliang/chinesetokenization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Segment.py
275 lines (245 loc) · 10.3 KB
/
Segment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = 'zdj'
from Node import *
from pro_dict import *
import re
import datetime
class Segment(object):
    """Dictionary-driven word segmenter built on a graph of candidate words.

    Each input sentence is cut at punctuation into short sentences.  For
    every short sentence a graph is built whose nodes are candidate words
    (every single character, plus every longer substring accepted by
    ``pro_dictionary.has_vocable``).  The word nodes are then replaced by
    (token, previous token) nodes, every node is scored through
    ``Node.set_best_pre_node`` (implemented outside this class), and the
    result is read back from the end marker through ``best_pre_node``.

    ``pro_dictionary`` must be assigned before ``segment`` is called.

    NOTE(review): the start and end markers are the plain strings 's' and
    'e' and are compared by value, so input that contains those Latin
    letters as tokens can be mistaken for a marker -- confirm the expected
    input, or switch the checks to node identity.
    """

    # Punctuation that delimits short sentences: U+2014-U+2026,
    # U+3000-U+303F, U+FF01-U+FF0C and U+FF1A-U+FF1F.  Compiled once here
    # instead of on every call.
    _PUNCTUATION_RE = re.compile(
        u'[\u2014-\u2026\u3000-\u303F\uff01-\uff0c\uff1a-\uff1f]')

    def __init__(self):
        self.node_list = {}         # not used by the methods of this class
        self.sentences = []         # short sentences of the current input
        self.current_index = 0
        self.temp_node_list = []    # boundary placeholder nodes
        self.pro_dictionary = None  # must be set by the caller
        self.graph_nodes_list = []  # word nodes of the current graph
        self.result_token = []      # tokens collected by final_token_path
        self.debug = False
        self.separator = u' '

    @staticmethod
    def _to_unicode(text):
        """Return *text* as unicode, decoding UTF-8 byte strings if needed."""
        if isinstance(text, (bytes, bytearray)):
            return text.decode('utf-8')
        return text

    @staticmethod
    def _elapsed_microseconds(start, stop):
        """Return the whole duration between two datetimes in microseconds."""
        delta = stop - start
        return (delta.days * 86400 + delta.seconds) * 1000000 + delta.microseconds

    def cut_into_short_sentence(self, input_stc):
        """Split *input_stc* at punctuation and store the non-empty pieces.

        :param input_stc: a sentence, either UTF-8 bytes or unicode text.

        The result is stored in ``self.sentences``.  Empty pieces (produced
        by adjacent, leading or trailing punctuation) are dropped.
        """
        line = self._to_unicode(input_stc).strip()
        # Build a new list instead of removing while iterating, which
        # skipped elements and left empty strings behind.
        self.sentences = [part for part in self._PUNCTUATION_RE.split(line)
                          if part]

    def create_new_connected_node(self, word, pre_node, next_node):
        """Create a node for *word* linked between *pre_node* and *next_node*.

        :return: the newly created node.
        """
        node = Node(word, None)
        pre_node.add_next_node(node)
        node.add_pre_node(pre_node)
        next_node.add_pre_node(node)
        node.add_next_node(next_node)
        return node

    def construct_token_graph(self, short_sentence):
        """Build the candidate-word graph for one short sentence.

        :param short_sentence: unicode text without the 's'/'e' markers.
        :return: ``(root, end)``, the nodes marked 's' and 'e'.

        ``self.temp_node_list`` receives one placeholder node per character
        boundary and ``self.graph_nodes_list`` receives every word node.
        """
        length = len(short_sentence)
        # Placeholder i sits before character i; placeholder ``length`` is
        # the end of the sentence.
        boundaries = [Node(None, None) for _ in range(length + 1)]
        self.temp_node_list = boundaries
        self.graph_nodes_list = []

        # Every single character is a candidate word.
        for index in range(length):
            self.graph_nodes_list.append(self.create_new_connected_node(
                short_sentence[index], boundaries[index], boundaries[index + 1]))

        # Every substring of two or more characters found in the dictionary
        # is a candidate word.  ``stop`` never exceeds ``length``, so the
        # boundary lookup cannot run past the placeholder list.
        for start in range(length - 1):
            for stop in range(start + 2, length + 1):
                word = short_sentence[start:stop]
                if self.pro_dictionary.has_vocable(word):
                    if self.debug:
                        print('In Dictionary: ' + str(start) + ' ' + str(stop) + ' ' + word)
                    self.graph_nodes_list.append(self.create_new_connected_node(
                        word, boundaries[start], boundaries[stop]))
                elif stop - start >= self.pro_dictionary.get_longest_length():
                    # A miss at the dictionary's reported longest length:
                    # stop extending from this start position.
                    break

        root = boundaries[0]
        root.current_token = 's'
        end = boundaries[length]
        end.current_token = 'e'
        # Splice out the inner placeholders so word nodes link directly.
        for boundary in boundaries[1:length]:
            self.connect_token_node(boundary)
        return root, end

    def connect_token_node(self, current_node):
        """Bypass *current_node*, linking its predecessors to its successors.

        When the only successor is the node marked 'e', that node keeps its
        other predecessors: *current_node* is removed from its ``pre_nodes``
        and *current_node*'s predecessors are appended.  Otherwise each
        successor's ``pre_nodes`` is replaced by a copy of *current_node*'s.
        """
        predecessors = current_node.pre_nodes
        successors = current_node.next_nodes
        for pre_node in predecessors:
            pre_node.next_nodes = successors[:]
        if len(successors) == 1 and successors[0].current_token == 'e':
            end = successors[0]
            end.pre_nodes.remove(current_node)
            end.pre_nodes.extend(predecessors)
        else:
            for next_node in successors:
                next_node.pre_nodes = predecessors[:]

    def is_pre_node(self, node, pre_node):
        """Return True if *pre_node*'s token equals *node*'s ``pre_token``."""
        return pre_node.current_token == node.pre_token

    def construct_three_token_graph_phase_1(self, node):
        """Insert a (token, previous token) node on every edge from *node*.

        Recurses into each successor that was reached through a newly
        inserted node.  Successors that already carry a ``pre_token`` and
        the node marked 'e' are left alone; nothing is done when *node*
        itself is marked 'e'.
        """
        if node.current_token == 'e':
            return
        for index, next_node in enumerate(node.next_nodes):
            if next_node.pre_token is not None or next_node.current_token == 'e':
                continue
            new_node = Node(next_node.current_token, node.current_token)
            new_node.pre_nodes.append(node)
            new_node.next_nodes.append(next_node)
            # Replace the direct edge node -> next_node on both sides.
            node.next_nodes[index] = new_node
            next_node.pre_nodes[next_node.pre_nodes.index(node)] = new_node
            self.construct_three_token_graph_phase_1(next_node)

    def construct_three_token_graph_phase_2(self):
        """Splice every node of ``self.graph_nodes_list`` out of the graph."""
        for node in self.graph_nodes_list:
            self.connect_token_node(node)

    def construct_three_lan_model_token_graph(self, node):
        """Split *node* into one copy per distinct predecessor token.

        NOTE(review): ``segment`` never calls this method.  A node without
        predecessors falls through without recursing, and the successors'
        ``pre_nodes`` are not rewired to the split copies -- confirm the
        intended algorithm before relying on it.
        """
        if not node.next_nodes:
            return
        if len(node.pre_nodes) == 1:
            node.pre_token = node.pre_nodes[0].current_token
            for next_node in node.next_nodes:
                self.construct_three_lan_model_token_graph(next_node)
            return

        split_nodes = []
        for pre_node in node.pre_nodes:
            for candidate in split_nodes:
                if self.is_pre_node(candidate, pre_node):
                    target = candidate
                    break
            else:
                # No copy exists yet for this predecessor token.
                target = Node(node.current_token, pre_node.current_token)
                target.next_nodes = node.next_nodes[:]
                split_nodes.append(target)
            target.pre_nodes.append(pre_node)
            pre_node.next_nodes.remove(node)
            pre_node.next_nodes.append(target)

        for new_node in split_nodes:
            for next_node in new_node.next_nodes:
                self.construct_three_lan_model_token_graph(next_node)

    def find_max_path(self, root):
        """Score every node reachable from *root*, depth first.

        Each child is offered *root* as a predecessor through
        ``set_best_pre_node``; the walk continues into the child only when
        its ``hasPass`` flag is set.
        """
        for child_node in root.next_nodes:
            child_node.set_best_pre_node(self.pro_dictionary, root)
            if child_node.hasPass:
                self.find_max_path(child_node)

    def final_token_path(self, node):
        """Collect tokens from *node* back to the node marked 's'.

        Tokens are appended to ``self.result_token`` in reverse sentence
        order; nodes marked 'e' are skipped.  Written as a loop so that a
        long path cannot exhaust the recursion limit.
        """
        while node.current_token != 's':
            if node.current_token != 'e':
                self.result_token.append(node.current_token)
            node = node.best_pre_node

    def scan_sentence_for_result(self, tokens, sentence):
        """Re-insert the punctuation of *sentence* into *tokens*.

        :param tokens: segmented words in sentence order; modified in place.
        :param sentence: the original sentence, UTF-8 bytes or unicode.
        :return: the same *tokens* list.

        Text remaining after the last token is appended as a single token.
        """
        sentence = self._to_unicode(sentence).strip()
        total = len(sentence)
        start_index = 0
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if sentence[start_index:start_index + len(token)] == token:
                start_index += len(token)
            else:
                if start_index >= total:
                    # Tokens remain but the sentence is used up; indexing
                    # the sentence here used to raise IndexError.
                    print('Error not match character')
                    break
                current_char = sentence[start_index]
                if self._PUNCTUATION_RE.match(current_char):
                    # The token at position i is re-examined next round,
                    # now at position i + 1.
                    tokens.insert(i, current_char)
                else:
                    print('Error not match character')
                start_index += 1
            i += 1
        if start_index < total:
            tokens.append(sentence[start_index:total])
        return tokens

    def segment(self, sentences):
        """Segment every sentence and return the joined results.

        :param sentences: iterable of sentences (UTF-8 bytes or unicode).
        :return: list with one string per input sentence, its tokens joined
            by ``self.separator``.

        When ``self.debug`` is set, the time spent in each stage (in
        microseconds) and the tokens of each short sentence are printed.
        """
        seg_sentence = []
        for sentence in sentences:
            tokens = []
            self.cut_into_short_sentence(sentence)
            for short_sentence in self.sentences:
                start_time_1 = datetime.datetime.now()
                root, end = self.construct_token_graph(short_sentence)
                start_time_2 = datetime.datetime.now()
                self.construct_three_token_graph_phase_1(root)
                start_time_3 = datetime.datetime.now()
                self.construct_three_token_graph_phase_2()
                start_time_4 = datetime.datetime.now()
                root.max_probability = 1
                self.find_max_path(root)
                start_time_5 = datetime.datetime.now()
                self.result_token = []
                self.final_token_path(end)
                start_time_6 = datetime.datetime.now()
                # final_token_path walks backwards from the end marker.
                self.result_token.reverse()
                tokens.extend(self.result_token)
                if self.debug:
                    elapsed = self._elapsed_microseconds
                    print('construct_token_graph time: ' + str(elapsed(start_time_1, start_time_2)))
                    print('construct_three_token_graph_phase_1: ' + str(elapsed(start_time_2, start_time_3)))
                    print('construct_three_token_graph_phase_2: ' + str(elapsed(start_time_3, start_time_4)))
                    print('find_max_path: ' + str(elapsed(start_time_4, start_time_5)))
                    print('final_token_path: ' + str(elapsed(start_time_5, start_time_6)))
                    print(u'/'.join(self.result_token))
            tokens = self.scan_sentence_for_result(tokens, sentence)
            seg_sentence.append(self.separator.join(tokens))
        return seg_sentence
if __name__ == '__main__':
    # Read the sentences to segment, one per line.  The context manager
    # closes the file even if reading fails.
    with open('input_sentence.txt') as input_file:
        sens = input_file.readlines()
    # Echo the first input line; an empty file has none to show.
    if sens:
        print(sens[0])
    seg = Segment()
    seg.pro_dictionary = ProDict()
    for result in seg.segment(sens):
        print(result)