/
helpers.py
331 lines (272 loc) · 11.1 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
import dict_api
import re
import logging
# Lowercase Latin alphabet; membership tests elsewhere lowercase the character first.
ENGLISH_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
# Lowercase Russian alphabet, including 'ё'.
RUSSIAN_LETTERS = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
def get_popular_word(filename):
    """
    Load a word list from a text file that holds one word per line.

    Each line becomes one element of the result with its trailing newline
    removed. A blank line yields the empty string, and a final line without
    a terminating newline is kept whole.

    :param filename: path of the file to read
    :return: set of the lines found in the file
    :raises OSError: if the file cannot be opened (e.g. it does not exist)
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, 'r') as f:
        # Strip only the line terminator so the word itself is never truncated.
        return {line.rstrip('\n') for line in f}
def save_text_to_file(text, filename):
    """
    Write text to a file, replacing whatever the file contained before.

    :param text: string to store
    :param filename: path of the file to (over)write
    """
    with open(filename, 'w') as destination:
        destination.write(text)
def is_string_good(line, delimiter):
    """
    Tell whether a line splits into exactly two parts on the delimiter.

    :param line: line to check; an empty or None line is never good
    :param delimiter: separator expected to occur exactly once
    :return boolean:
    """
    return bool(line) and len(line.split(delimiter)) == 2
def make_beauty_1_word(words_dict):
    """
    Build the HTML card template for a single word.

    The part of speech is wrapped in parentheses and the transcription in
    "/ ... /"; either is left empty when the source value is empty (blank,
    for the transcription). The "{eng}", "{tab}" and "{rus}" placeholders
    are deliberately left in the returned string so that it can be
    formatted again later.

    :param words_dict: mapping with at least the keys "pos" and "ts",
        e.g. {"ts": ts, "pos": pos, "word": word}
    :return: HTML template string
    """
    part_of_speech = "(" + words_dict["pos"] + ")" if words_dict["pos"] else ""
    transcription = "/ " + words_dict["ts"] + " /" if words_dict["ts"].strip() else ""
    template = '<span style="color: #aaa;">{pos}</span> {word}<br/><span style="color: #666;">{ts}</span>{tab}{rus}'
    return template.format(
        pos=part_of_speech,
        word="{eng}",
        ts=transcription,
        rus="{rus}",
        tab="{tab}",
    )
def make_beauty_some_words(words_dict, pos):
    """
    Build the HTML card template for a phrase made of several words.

    All non-blank transcriptions are placed, each padded with one space on
    both sides, between a single pair of slashes; if no word has a
    transcription that part stays empty. The "{eng}", "{tab}" and "{rus}"
    placeholders are left in the returned string for a later format pass.

    :param words_dict: iterable of mappings, each with a "ts" key
    :param pos: part of speech shared by the words; wrapped in parentheses
        when truthy, otherwise inserted as given
    :return: HTML template string
    """
    if pos:
        pos = "(" + pos + ")"
    padded = [" " + entry["ts"] + " " for entry in words_dict if entry["ts"].strip() != ""]
    transcription = "/" + "".join(padded) + "/" if padded else ""
    template = '<span style="color: #aaa;">{pos}</span> {word}<br/><span style="color: #666;">{ts}</span>{tab}{rus}'
    return template.format(pos=pos, word="{eng}", ts=transcription, rus="{rus}", tab="{tab}")
# An innermost [...] or (...) group together with the whitespace around it.
_BRACKET_GROUP_RE = re.compile(r"\s*(?:\[[^\[\]]*\]|\([^()]*\))\s*")
# First English or Russian letter of a line.
_FIRST_LETTER_RE = re.compile(r"[a-zA-Zа-яА-ЯёЁ]")


def handle_line(line):
    """
    Clean one dictionary line before it is parsed.

    Text enclosed in matching [] or () is removed (nested groups included),
    then everything before the first English or Russian letter (bullets,
    numbering, ...) is dropped and the result is stripped. Unpaired
    brackets are left alone. A line without any letter is only stripped.

    >>> handle_line("● Blow away the cobwebs - прогуляться, проветриться")
    'Blow away the cobwebs - прогуляться, проветриться'
    >>> handle_line("verb [vɜːb] - глагол")
    'verb - глагол'
    >>> handle_line("verb - глагол(pew) ")
    'verb - глагол'
    >>> handle_line("fall down – 1) пасть ниц; 2) потерпеть неудачу")
    'fall down – 1) пасть ниц; 2) потерпеть неудачу'
    >>> handle_line("")
    ''

    :param line:
    :return handled line:
    """
    # Remove innermost groups until nothing changes so nested brackets go too.
    # Every substitution shortens the line, so the loop always terminates.
    previous = None
    while previous != line:
        previous = line
        line = _BRACKET_GROUP_RE.sub(" ", line)
    first_letter = _FIRST_LETTER_RE.search(line)
    if first_letter is not None:
        line = line[first_letter.start():]
    return line.strip()
def handle_words(words):
    """
    Build the HTML card template for a phrase of two or more words.

    Every word longer than two characters is looked up through
    dict_api.lookup_ts_pos_force(). The part of speech passed on is the
    first non-empty "pos" among the lookups, unless some other lookup has
    a different non-empty "pos", in which case it is left empty.

    :param words: list with at least two words
    :return: result of make_beauty_some_words() for the looked-up words
    """
    assert type(words) is list
    assert len(words) > 1
    looked_up = [dict_api.lookup_ts_pos_force(item) for item in words if len(item) > 2]
    # First part of speech that is present at all ...
    common_pos = next((entry["pos"] for entry in looked_up if entry["pos"]), "")
    # ... which is kept only when no other word disagrees with it.
    if any(entry["pos"] and entry["pos"] != common_pos for entry in looked_up):
        common_pos = ""
    return make_beauty_some_words(looked_up, common_pos)
def handle_words_or_word(words):
    """
    Build the HTML card template for a list holding one or several words.

    :param words: list of words; an empty list gives an empty string
    :return: make_beauty_1_word() output for a single word (after a
        dict_api.lookup_ts_pos_force() lookup), handle_words() output for
        several words, "" for none
    """
    assert type(words) is list
    if not words:
        return ""
    if len(words) > 1:
        return handle_words(words)
    return make_beauty_1_word(dict_api.lookup_ts_pos_force(words[0]))
def find_delimiter(text, dash_types=False):
    """
    Find which dash separates the English and the Russian part of the lines.

    The dashes are tried in order. A dash is accepted when more than half
    of the non-blank lines split on it into exactly two parts, one without
    Cyrillic letters and the other without Latin letters (in either order).

    >>> find_delimiter("resume - резюме")
    ' - '
    >>> find_delimiter("occupation — занятие")
    ' — '
    >>> find_delimiter('unemployed / jobless / out-of-work / man out of occupation - безработный')
    ' - '
    >>> find_delimiter("lose (lost, lost) one's job - потерять работу")
    ' - '
    >>> find_delimiter('lump-sum allowance - единовременное пособие')
    ' - '
    >>> find_delimiter('lump-sum allowance — единовременное пособие')
    ' — '
    >>> find_delimiter("43. What's the idea of - В чём смысл, что за глупость -")
    ' - '
    >>> find_delimiter('Мне нужно лекарство от простуды. — I need a cold medicine.')
    ' — '
    >>> find_delimiter("")
    False

    :param text: text with one "phrase <dash> translation" pair per line
    :param dash_types: optional list of delimiters to try; a falsy value
        selects the built-in list
    :return dash: the first suitable delimiter, or False when none fits or
        the text has no non-blank lines
    """
    # In PyCharm these look similar, but they are different characters.
    # NOTE(review): the last two entries render identically here - verify
    # their code points before relying on both.
    if not dash_types:
        dash_types = [' — ', ' - ', ' - ']
    # A dash is right if it fits more than koeff*100% of the lines.
    koeff = 0.5
    # The set of non-blank lines does not depend on the dash: collect it once.
    lines = [line for line in text.split("\n") if line.strip() != ""]
    if not lines:
        # Nothing to analyse; also avoids dividing by zero below.
        return False
    for dash in dash_types:
        suitable_for_this_delimiter = 0
        for line in lines:
            parts = line.split(dash)
            if len(parts) != 2:
                continue
            l_part, r_part = parts
            if (is_english(l_part) and is_russian(r_part)) or (is_russian(l_part) and is_english(r_part)):
                suitable_for_this_delimiter += 1
        if float(suitable_for_this_delimiter) / len(lines) > koeff:
            return dash
    return False
def find_delimiter_euristic(text, max_delimiter_size=2, stake_for_approve_delimiter=0.7):
    """
    Guess the delimiter that separates the two languages in a text.

    Every non-blank line containing both Latin and Cyrillic letters is
    passed to find_delimiter_euristic_line(). Lines where that detection
    failed (empty result) are ignored, and the most frequent remaining
    delimiter wins. If the winner is longer than max_delimiter_size it was
    probably detected with extra characters attached, so a short detected
    delimiter contained in a large enough share of the results is
    preferred when there is one.

    :param text: text with one bilingual pair per line
    :param max_delimiter_size: longest delimiter accepted without the
        second opinion described above
    :param stake_for_approve_delimiter: share (0..1) of detected delimiters
        that must contain a short candidate for it to replace a too long
        winner
    :return delimiter: the guessed delimiter, or "" if nothing was detected
    """
    candidates = []
    for line in text.split("\n"):
        if line.strip() != "" and re.search("[a-zA-Z]+", line) and re.search("[а-яА-Я]+", line):
            candidates.append(find_delimiter_euristic_line(line))
    # "" marks a line where detection failed; it must not take part in the
    # vote (and, as a substring, it would "match" every other delimiter).
    detected = [candidate for candidate in candidates if candidate]
    if not detected:
        return ""
    counts = {}
    for candidate in detected:
        counts[candidate] = counts.get(candidate, 0) + 1
    # max() keeps the first maximum in dict iteration order.
    delimiter_most_popular = max(counts, key=counts.get)
    # If the delimiter is longer than allowed it was probably detected
    # wrongly, e.g. "...=" was found instead of "=".
    if len(delimiter_most_popular) > max_delimiter_size:
        logging.info("Delimiter too long:{}".format(delimiter_most_popular))
        best_delimiter = ""
        best_share = 0.0
        # Consider only the candidates that are short enough.
        for pretendent_delimiter in counts:
            if len(pretendent_delimiter) > max_delimiter_size:
                continue
            relevant = sum(1 for delimiter in detected if pretendent_delimiter in delimiter)
            share = relevant / float(len(detected))
            if share > stake_for_approve_delimiter and share > best_share:
                best_delimiter = pretendent_delimiter
                best_share = share
        if best_delimiter:
            logging.info("I think this delimiter better:{}".format(best_delimiter))
            return best_delimiter
    return delimiter_most_popular.strip()
def find_delimiter_euristic_line(line):
    """
    Guess the delimiter between the two languages within a single line.

    The language of the first letter in the line is taken as the first
    language. The delimiter is whatever lies between the last letter of
    the first language and the first letter of the second one. When the
    languages are interleaved, or one of them is missing, detection fails:
    a message is printed and "" is returned.

    >>> find_delimiter_euristic_line("startle at smth– вздрогнуть от чего-то")
    '–'
    >>> find_delimiter_euristic_line(" pew/PEW-wow @ пышь/ПЫШЬ")
    '@'
    >>> find_delimiter_euristic_line(" пышь/ПЫШЬ @ pew/PEW-wow ")
    '@'
    >>> find_delimiter_euristic_line("superior — превосходный")
    '—'
    >>> find_delimiter_euristic_line("1. cat - кот")
    '-'
    >>> find_delimiter_euristic_line("lovey-dovey — шаловливый, игривый, влюбленный (ame)")
    problem with euristic detection delimiter in line: lovey-dovey — шаловливый, игривый, влюбленный (ame)
    ''

    :param line: one line of text, expected to contain both languages
    :return: the stripped delimiter, or "" when it cannot be detected
    """
    line = line.strip().lower()
    english, russian = "[a-z]", "[а-яё]"
    # The first *letter* decides the direction, so leading digits or
    # bullets do not turn an English-first line into a Russian-first one.
    first_letter = re.search("[a-zа-яё]", line)
    if first_letter is not None and "a" <= first_letter.group() <= "z":
        first_language, second_language = english, russian
    else:
        first_language, second_language = russian, english
    # Position of the last letter of the first language (None if absent).
    finish_1st_part = None
    for match in re.finditer(first_language, line):
        finish_1st_part = match.start()
    # Position of the first letter of the second language (None if absent).
    second_match = re.search(second_language, line)
    start_2nd_part = None if second_match is None else second_match.start()
    if finish_1st_part is None or start_2nd_part is None or start_2nd_part <= finish_1st_part:
        print("problem with euristic detection delimiter in line: {}".format(line))
        return ""
    return line[finish_1st_part + 1:start_2nd_part].strip()
def is_english(text):
    """
    Tell whether the text is free of Cyrillic letters.

    Digits, punctuation and an empty string therefore count as English.

    >>> is_english('abc')
    True
    >>> is_english('unemployed / jobless / out-of-work / man out of occupation - ')
    True
    >>> is_english('43. Whats the idea of - В чём смысл, что за глупость -')
    False

    :param text:
    :return boolean:
    """
    return re.search('[а-яА-Я]+', text) is None
def is_russian(text):
    """
    Tell whether the text is free of Latin letters.

    Digits, punctuation and an empty string therefore count as Russian.

    >>> is_russian("абв")
    True
    >>> is_russian('43. Whats the idea of - В чём смысл, что за глупость -')
    False
    >>> is_russian("Хотите за 3 минуты узнать свой словарный запас английских слов? ")
    True

    :param text:
    :return boolean:
    """
    return re.search('[a-zA-Z]+', text) is None
if __name__ == "__main__":
    # Sample inputs kept from earlier manual checks of the formatting helpers:
    # dt = {"ts": "ˈæˌktʃuəli, ˈæktʃli, ˈækʃəli", "pos": "adv.", "word": "actually" }
    # print(make_beauty_1_word(dt))
    # words = [{"ts": "ˈkɒrən(ə)rɪ", "pos": "noun.", "word": "coronary" },
    #          {"ts": "ˈɑːtərɪ", "pos": "noun", "word": "artery"},
    #          {"ts": "veɪn", "pos": "", "word": "vein"}]
    # print(make_beauty_some_words(words, "noun."))
    # print(handle_words(["bronchi", "artery", "vein"]))
    # print(is_english("abя"))

    # Manual check of the heuristic delimiter finder on a small word list.
    sample_text = """СТРАХ
feel sick at smth – слабеть при виде чего-то
pallid at smth – побледневший от чего-то
startle at smth– вздрогнуть от чего-то
aghast at smth – пораженный ужасом при виде
appalled at / with smth – устрашенный чем-то
dismayed at / with smth – приведенный в ужас чем-то
frightened at smth – испуганный чем-то
horrified at smth – в ужасе от чего-то
"""
    print(find_delimiter_euristic(sample_text))
    print(find_delimiter_euristic_line("startle at smth– вздрогнуть от чего-то"))