forked from KA-Advocates/KATranslationCheck
/
AutoTranslationTranslator.py
executable file
·597 lines (546 loc) · 26.8 KB
/
AutoTranslationTranslator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
#!/usr/bin/env python3
import re as re
from collections import Counter, defaultdict, namedtuple
from ansicolor import red
import os.path
import json
import itertools
import random
from toolz.dicttoolz import merge
from AutoTranslateCommon import *
from googletrans import Translator
class CompositeAutoTranslator(object):
    """
    Chain of auto-translators that are queried one after another.

    The first child that yields a translation (anything but None)
    determines the result.
    """
    def __init__(self, *args):
        # None arguments are dropped so callers can pass optional translators
        self.children = [translator for translator in args if translator is not None]

    def translate(self, engl):
        """
        Return the first non-None translation produced by a child,
        or None if no child could translate the string.
        """
        # Lazy generator: children after the first hit are never called
        attempts = (child.translate(engl) for child in self.children)
        return next((hit for hit in attempts if hit is not None), None)
class RuleAutotranslator(object):
    """
    Auto-translates based on regex rules.

    Strings that consist only of language-independent content (formulas,
    Perseus image URLs, input widgets, simple coordinates) are "translated"
    by returning them unchanged.
    """
    # Formulas:
    # $...$
    # **$...$
    _is_formula = re.compile(r"^(>|#)*[\s\*]*(\$[^\$]+\$(\s|\\n|\*)*)+$")
    # URLs:
    # ![](web+graphie://ka-perseus-graphie.s3.amazonaws.com/...)
    # web+graphie://ka-perseus-graphie.s3.amazonaws.com/...
    # https://ka-perseus-graphie.s3.amazonaws.com/...png
    _is_perseus_img_url = re.compile(r"^(!\[\]\()?\s*(http|https|web\+graphie):\/\/ka-perseus-(images|graphie)\.s3\.amazonaws\.com\/[0-9a-f]+(\.(svg|png|jpg|jpeg))?\)?\s*$")
    # One or more formulas followed by an image URL.
    # Accepts the same extensions as _is_perseus_img_url (including jpeg).
    _is_formula_plus_img = re.compile(r"^>?[\s\*]*(\$[^\$]+\$(\s|\\n|\*)*)+(!\[\]\()?\s*(http|https|web\+graphie):\/\/ka-perseus-(images|graphie)\.s3\.amazonaws\.com\/[0-9a-f]+(\.(svg|png|jpg|jpeg))?\)?\s*$")
    # Input widget: [[☃ widget-name 1]]
    _is_input = re.compile(r"^\[\[\s*☃\s*[a-z-]+\s*\d*\s*\]\](\s|\\n)*$", re.UNICODE)
    # One or more formulas, optional "=", then an input widget
    _is_formula_plus_input = re.compile(r"^(>|#)*[\s\*]*(\$[^\$]+\$(\s|\\n|\*)*)+=?\s*\[\[\s*☃\s*[a-z-]+\s*\d*\s*\]\](\s|\\n)*$", re.UNICODE)
    # (1,2) or [-3,4]
    _is_simple_coordinate = re.compile(r"^[\[\(]-?\d+,-?\d+[\]\)]$")

    def __init__(self):
        # contains a \text{ clause except specific text clauses:
        # \text{ cm}
        # \text{ m}
        # \text{ g}
        self._contains_text = get_text_regex()

    def translate(self, engl):
        """
        Return engl unchanged if it needs no translation, else None.

        :param engl: The english source string
        :return: engl if one of the rules applies, None otherwise
        """
        # These patterns never contain translatable text
        verbatim_patterns = (
            self._is_perseus_img_url,
            self._is_formula_plus_img,
            self._is_input,
            self._is_formula_plus_input,
            self._is_simple_coordinate,
        )
        if any(pattern.match(engl) is not None for pattern in verbatim_patterns):
            return engl
        # A pure formula is only language-independent without \text{...} content
        is_formula = self._is_formula.match(engl) is not None
        if is_formula and self._contains_text.search(engl) is None:
            return engl
        return None
class IFPatternAutotranslator(object):
    """
    Ignore Formula pattern autotranslator.

    Formulas and images in the source string are replaced by the markers
    "§formula§" / "§image§". If the resulting pattern has a known
    translation, the original formulas and images are re-inserted into
    that translation in source order.
    """
    def __init__(self, lang):
        self.lang = lang
        # Read patterns index
        self.ifpatterns = read_ifpattern_index(lang)
        self.texttags = read_texttag_index(lang)
        # Compile regexes
        self._formula_re = re.compile(r"\$[^\$]+\$")
        self._img_re = get_image_regex()
        self._text = get_text_content_regex()

    @staticmethod
    def _refill(transl, marker, sources):
        """
        Replace each marker in transl, left to right, by the next source item.

        Returns None if transl contains more markers than there are
        source items (the pattern does not fit the source string).
        """
        remaining = iter(sources)
        while marker in transl:
            replacement = next(remaining, None)
            if replacement is None: # Ran out of source items
                return None
            transl = transl.replace(marker, replacement, 1)
        return transl

    def translate(self, engl):
        """
        Translate engl via the pattern index.

        :param engl: The english source string
        :return: The translated string, or None if it can't be translated
        """
        # Mathrm is a rare alternative to \\text which is unhandled at the moment
        if "mathrm" in engl:
            return None
        # If there are any texts, check if we know how to translate
        texttag_replace = {} # texttags: engl full tag to translated full tag
        for text_hit in self._text.finditer(engl):
            content = text_hit.group(2).strip()
            if content not in self.texttags: # Untranslatable tag
                return None # Cant fully translate this string
            # Assemble the correct replacement string
            texttag_replace[text_hit.group(0)] = \
                text_hit.group(1) + self.texttags[content] + text_hit.group(3)
        # Normalize: formulas first, then images
        normalized = self._formula_re.sub("§formula§", engl)
        normalized = self._img_re.sub("§image§", normalized)
        # Check if it matches
        transl = self.ifpatterns.get(normalized)
        if transl is None:
            return None # Do not have pattern
        # Re-insert the source formulas one-by-one
        transl = self._refill(transl, "§formula§", self._formula_re.findall(engl))
        if transl is None:
            return None # Pattern has more formulas than the source string
        # Re-insert the source images one-by-one.
        # Element [0] of each findall() hit is used as the image text.
        transl = self._refill(transl, "§image§",
            (hit[0] for hit in self._img_re.findall(engl)))
        if transl is None:
            return None # Pattern has more images than the source string
        # Translate text-tags, if any
        for src, repl in texttag_replace.items():
            # Safety: If there is nothing to replace, fail instead of
            # failing to translate a text tag
            if src not in transl:
                print(red("Text-tag translation: Can't find '{}' in '{}'".format(
                    src, transl), bold=True))
                return None
            transl = transl.replace(src, repl)
        return transl
class NameAutotranslator(object):
    """
    Auto-translates short answer phrases built around person names,
    e.g. "Only Alice" or "Both Alice and Bob are correct".

    The names are copied verbatim into a language-specific template.
    """
    def __init__(self, lang):
        """
        :param lang: Language code, must be a key of the translation map
        :raises ValueError: If there is no translation mapping for lang
        """
        self.lang = lang
        self._re1 = re.compile(r"^\s*Only\s+([A-Z][a-z]+)((\.|\s+|\\n)*)$")
        self._re2 = re.compile(r"^\s*Neither\s+([A-Z][a-z]+)\s+nor\s+([A-Z][a-z]+)((\.|\s+|\\n)*)$")
        self._re3 = re.compile(r"^\s*Either\s+([A-Z][a-z]+)\s+or\s+([A-Z][a-z]+)((\.|\s+|\\n)*)$")
        self._re4 = re.compile(r"^\s*Both\s+([A-Z][a-z]+)\s+and\s+([A-Z][a-z]+)((\.|\s+|\\n)*)$")
        self._re5 = re.compile(r"^\s*Neither\s+([A-Z][a-z]+)\s+nor\s+([A-Z][a-z]+)\s+are\s+correct((\.|\s+|\\n)*)$")
        self._re6 = re.compile(r"^\s*Both\s+([A-Z][a-z]+)\s+and\s+([A-Z][a-z]+)\s+are\s+correct((\.|\s+|\\n)*)$")
        self._re7 = re.compile(r"^\s*Yes,\s+([A-Z][a-z]+)\s+is\s+correct\s+but\s+([A-Z][a-z]+)\s+is\s+not((\.|\s+|\\n)*)$")
        self._re8 = re.compile(r"^\s*In conclusion,\s*([A-Z][a-z]+)\s+is\s+correct((\.|\s+|\\n)*)$")
        self._re9 = re.compile(r"^\s*Only\s*([A-Z][a-z]+)\s+is\s+correct((\.|\s+|\\n)*)$")
        self._re10 = re.compile(r"^\s*([A-Z][a-z]+)'s\s+work\s+is\s+correct((\.|\s+|\\n)*)$")
        # (regex, number of names) in the same order as the transmap entries
        self._rules = [
            (self._re1, 1),
            (self._re2, 2),
            (self._re3, 2),
            (self._re4, 2),
            (self._re5, 2),
            (self._re6, 2),
            (self._re7, 2),
            (self._re8, 1),
            (self._re9, 1),
            (self._re10, 1),
        ]
        # Translation patterns in this order:
        # 1. Only <name1>
        # 2. Neither <name1> nor <name2>
        # 3. Either <name1> or <name2>
        # 4. Both <name1> and <name2>
        # 5. Neither <name1> nor <name2> are correct
        # 6. Both <name1> and <name2> are correct
        # 7. Yes, <name1> is correct but <name2> is not
        # 8. In conclusion, <name1> is correct
        # 9. Only <name1> is correct
        # 10.<name1>'s work is correct
        transmap = {
            "sv-SE": [
                "Endast <name1>",
                "Varken <name1> eller <name2>",
                "Antingen <name1> eller <name2>",
                "Både <name1> och <name2>",
                "Varken <name1> eller <name2> har rätt",
                "Både <name1> och <name2> har rätt",
                "Ja, <name1> har rätt men inte <name2>",
                "Avslutningsvis, så har <name1> rätt",
                "Endast <name1> har rätt",
                "<name1>s lösning är rätt"
            ], "lol": [
                # Shorter than the rule list: rules 7-10 are untranslatable
                "Only <name1>",
                "Neither <name1> norz <name2>",
                "Either <name1> or <name2>",
                "Both <name1> and <name2>",
                "Neither <name1> nor <name2> are correct",
                "Both <name1> and <name2> are correct"
            ], "de": [
                "Nur <name1>",
                "Weder <name1> noch <name2>",
                "Entweder <name1> oder <name2>",
                "Sowohl <name1> als auch <name2>",
                "Weder <name1> noch <name2> liegen richtig",
                "Sowohl <name1> als auch <name2> liegt richtig",
                "Ja, <name1> liegt richtig, aber <name2> liegt falsch",
                "Zusammenfassend liegt <name1> richtig",
                "Nur <name1> liegt richtig",
                "Die Lösung von <name1> ist korrekt"
            ], "hu": [
                "Csak <name1>",
                "Sem <name1> sem <name2>",
                "<name1> is vagy <name2> is",
                "<name1> is és <name2> is",
                "Sem <name1> sem <name2> nem helyes",
                "<name1> és <name2> is helyes",
                "Igen, <name1> helyes, de <name2> nem helyes",
                "Tehát <name1> helyes",
                "Csak <name1> helyes",
                "<name1> megoldása helyes"
            ]
        }
        if lang not in transmap:
            # Raising a plain string is a TypeError in Python 3
            raise ValueError("Please create name translation mapping for {}".format(lang))
        self.transmap = transmap[lang]

    def replace_name(self, lang, name):
        """
        Get the localized replacement name
        """
        # TODO not implemented
        return name

    def _translate_match_two_names(self, m, transmap_entry):
        """
        Fill a two-name template from match m (groups: name1, name2, rest).
        Returns None if there is no template.
        """
        if transmap_entry is None: # Unknown translation
            return None # Cant translate
        name1 = m.group(1)
        name2 = m.group(2)
        rest = m.group(3)
        return transmap_entry.replace("<name1>", name1).replace("<name2>", name2) + rest

    def _translate_match_one_name(self, m, transmap_entry):
        """
        Fill a one-name template from match m (groups: name1, rest).
        Returns None if there is no template.
        """
        if transmap_entry is None: # Unknown translation
            return None # Cant translate
        name1 = m.group(1)
        rest = m.group(2)
        return transmap_entry.replace("<name1>", name1) + rest

    def translate(self, engl):
        """
        Translate engl if it matches one of the name phrases.

        :param engl: The english source string
        :return: The translated string, or None if no rule matches or the
                 language has no template for the matching rule
        """
        for index, (regex, name_count) in enumerate(self._rules):
            m = regex.match(engl)
            if m is None:
                continue
            # Languages may define fewer templates than there are rules
            entry = self.transmap[index] if index < len(self.transmap) else None
            if name_count == 2:
                return self._translate_match_two_names(m, entry)
            return self._translate_match_one_name(m, entry)
        return None
# Result record of FullAutoTranslator.preproc(), consumed by postproc()
PlaceholderInfo = namedtuple(
    "PlaceholderInfo",
    [
        "nPlaceholders",  # Number of placeholders that were inserted
        "replaceMap",  # (placeholder, original text) pairs
        "nAsterisks",  # combo counts of "*" in the source string
        "nNewlines",  # combo counts of "\\n" in the source string
        "nHashs",  # combo counts of "#" in the source string
        "nUnderscores",  # combo counts of "_" in the source string
    ],
)
class FullAutoTranslator(object):
    """
    Google translate based full auto translator.

    Everything that must not be touched by the translator (formulas,
    images, markup, placeholders, ...) is replaced by placeholders before
    translation and restored afterwards. A translation is rejected
    (None is returned) if the placeholders or the markup can not be
    reconstructed.
    """
    def __init__(self, lang, limit=25):
        """
        :param lang: Target language code. "lol" is translated to "de".
        :param limit: Maximum number of strings translate() will
                      successfully translate.
        """
        self.lang = lang if lang != "lol" else "de" # LOL => translate to DE
        # Generate nonce to fix some bad translations
        self.nonce1 = random.randint(1000000, 9999999)
        self.nonce2 = random.randint(1000, 9999)
        #
        # Pattern regexes
        #
        # <g id="continue">%1$s</g> or <g id="get_help_link">%2$s</g> misrecognized as
        self._formula_re = re.compile(r"\s*(?<!\%[\dA-Za-z])\$(\\\$|[^\$])+\$\s*")
        self._asterisk_re = re.compile(r"\s*\*+\s*")
        self._underscore_re = re.compile(r"\s*_+\s*")
        self._special_chars_re = re.compile(r"\s*[θ𝘹𝘺ƒ𝘢𝘣𝘶𝘯𝘥𝘬𝘍𝑥𝑦𝑚𝑏𝑒𝑟𝑔𝑡𝜇—≠ⁿˣ⋅]+\s*") # translate will fail for these
        self._hash_re = re.compile(r"\s*#+\s*")
        self._table_empty_re = re.compile(r"\s*:-:\s*")
        self._newline_re = re.compile(r"\s*(\\n)+\s*")
        self._index_placeholder_re = re.compile(r"\s*§(image|formula)§\s*")
        self._input_re = re.compile(r"\s*\[\[☃\s+[a-z-]+\s*\d*\]\]\s*")
        self._image_re = re.compile(r"\s*!\[([^\]]*)\]\(\s*(http|https|web\+graphie):\/\/ka-perseus-(images|graphie)\.s3\.amazonaws\.com\/[0-9a-f]+(\.(svg|png|jpg|jpeg))?\)\s*")
        self._tag_re = re.compile(r"\s*</?\s*[a-z-]+\s*([a-z-]+=\"[^\"]+\"\s*)*\s*/?>\s*")
        self._suburl_re = re.compile(r"\s*\[\**([^\]\*]+)\**\]\s*\(\s*[^\)]+\s*\)\s*")
        self._code_re = re.compile(r"\s*```[^`]+```\s*")
        self._entity_re = re.compile(r"\s*&[#0-9a-z]+;\s*")
        self._kaplaceholder_re = re.compile(r"\s*\%\([^\)]+\)[a-zA-Z]\s*")
        self._mobile_placeholder_re = re.compile(r"\s*\%[\dA-Za-z](\$[\dA-Za-z])?\s*")
        #
        # Blacklist regexes
        #
        self._text_re = re.compile(r"\\(text|mathrm|textit|textbf)\s*\{([^\}]+)\}")
        self._start_whitespace_re = re.compile(r"^\s*")
        # Anchored at the end; has to be used with search(), not match()
        self._end_whitespace_re = re.compile(r"\s*$")
        self.limit = limit
        self.count = 0
        # Debug output contains non-ASCII text, so do not rely on the locale encoding
        self.dbgout = open("fullauto-dbg.txt", "w", encoding="utf-8")
        # Blacklisted (actually used in some strings): △☐☺▫
        self.uchars = "■□▢▣▤▥▦▧▨▩▪▬▭▮▯▰▱▲▴▵▶▷▸▹►▻▼▽▾▿◀◁◂◃◄◅◆◇◈◉◊○◌◍◎●◐◑◒◓◔◕◖◗◘◙◚◛◜◝◞◟◠◡◢◣◤◥◧◨◩◪◫◬◭◮◯◰◱◲◳◴◵◶◷◸◹◺◻◼◽◿◾─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟☀☁☂☄★☆☇☈☉☊☋☌☍☎☏☑☒☓☔☕☖☗☘☙☚☛☜☝☞☟☠☡☢☣☤☥☦☧☨☩☪☫☬☭☮☯☰☱☲☳☴☵☶☷☸☹☻☼☽☾☿♀♁♂♃♄♅♆♇♈♉♊♋♌♍♎♏♐♑♒♓♔♕♖♗♘♙♚♛♜♝♞♟"
        assert(" " not in self.uchars)
        assert(len(set(list(self.uchars))) == len(self.uchars))
        # Create map between placeholders. This is required for nested patterns.
        self.protoPlaceholderToNumericPlaceholder = {
            c: self.placeholder(i)
            for i, c in enumerate(self.uchars)
        }

    def __del__(self):
        # dbgout does not exist if __init__ failed before opening the file
        dbgout = getattr(self, "dbgout", None)
        if dbgout is not None:
            dbgout.close()

    def proto_placeholder(self, n):
        """
        Get the n-th proto placeholder (a single unicode character)
        """
        return self.uchars[n]

    def placeholder(self, n):
        """
        Get the n-th final placeholder (a digit string framed by the nonces)
        """
        #return self.uchars[n]
        return "{}{}{}".format(self.nonce1, n, self.nonce2)

    def _subtranslate_group(self, match, groupno):
        """
        Return the matched text with group groupno replaced by its translation.

        Leading and trailing whitespace of the group is preserved.
        Only the span of the group itself is replaced, so identical text
        elsewhere in the match (e.g. the "text" of "\\text{text}") is untouched.
        """
        whole = match.group(0)
        subgroup = match.group(groupno)
        if subgroup is None or not subgroup.strip():
            return whole # Nothing to translate
        # Extract whitespace before and after
        ws_before = self._start_whitespace_re.match(subgroup).group(0)
        ws_after = self._end_whitespace_re.search(subgroup).group(0)
        # Subtranslate. Strip whitespaces to re-insert the correct amount later
        trans = self.google_translate(subgroup).strip()
        trans = "{}{}{}".format(ws_before, trans, ws_after)
        # Group span relative to the start of the whole match
        start = match.start(groupno) - match.start(0)
        end = match.end(groupno) - match.start(0)
        return whole[:start] + trans + whole[end:]

    def placeholder_replace(self, s, n, regex, subtrans_groupno=None):
        """
        Replace every match of regex in s by a proto placeholder.

        :param s: The string to process
        :param n: Index of the first unused placeholder
        :param regex: Compiled regex of the parts to replace
        :param subtrans_groupno: If not None, this regex group of every
            match is translated separately before the match is stored
        :return: (processed string, {placeholder: replaced text}, next unused n)
        """
        repmap = {}
        while True:
            match = regex.search(s)
            if match is None: # No more formulas
                break
            formula = match.group(0)
            current_placeholder = self.proto_placeholder(n)
            # Subtranslate
            if subtrans_groupno is not None:
                formula = self._subtranslate_group(match, subtrans_groupno)
            # Add into map
            repmap[current_placeholder] = formula
            # Replace exactly the match that was just found
            s = s[:match.start()] + current_placeholder + s[match.end():]
            n += 1
        return s, repmap, n

    def final_replace(self, s, n):
        """
        Replace proto placeholders by final placeholders.
        Spaces are added before and after each final placeholder
        to separate it from other elements of the text.
        """
        for i in range(n):
            s = s.replace(self.proto_placeholder(i), " {} ".format(self.placeholder(i)))
        return s

    def first_stage_backreplace(self, s, repmap):
        """
        Replace final (numeric) placeholders by proto placeholders.

        :param repmap: (proto placeholder, replaced text) pairs
        :return: The processed string, or None if a placeholder
                 is missing or duplicated in s
        """
        for protoPlaceholder, _ in repmap:
            # Get numeric placeholder
            placeholder = self.protoPlaceholderToNumericPlaceholder[protoPlaceholder]
            # Check if it got mis-translated...
            if placeholder not in s:
                # Special case for nested patterns:
                # Nested patterns will not be replaced by 2nd stage (numeric) placeholders
                is_nested = any(protoPlaceholder in val for _, val in repmap)
                if is_nested:
                    continue # no need to replace numeric by proto pattern
                # not nested, fail!
                print(red("{} not found in '{}'".format(placeholder, s), bold=True))
                return None
            if s.count(placeholder) > 1:
                print(red("Placeholder {} was duplicated in '{}'".format(placeholder, s), bold=True))
                return None
            # Replace by proto-placeholder which is a unicode char
            s = re.sub(r"\s*" + placeholder + r"\s*",
                protoPlaceholder, s, flags=re.UNICODE)
        return s

    def check_no_placeholders_present(self, s):
        """
        Return True if s contains none of the proto placeholder characters
        """
        for c in self.uchars:
            if c in s:
                print(red("Found placeholder {} in '{}'".format(c, s), bold=True))
                return False
        return True

    def combo_count(self, s, char):
        """
        Count the occurrences of char repeated 1..9 times in s
        """
        return [s.count(char * n) for n in range(1, 10)]

    def back_replace(self, s, repmap):
        """
        Replace proto placeholders by their original text,
        in the order given by repmap.
        """
        for placeholder, rep in repmap:
            s = s.replace(placeholder, rep)
        return s

    def preproc(self, s, subtranslate=True):
        """
        Forward-replace non-translatable parts by placeholders.

        Parts are first replaced by proto placeholders (unicode chars).
        As those will often be touched by the translator, they are then
        replaced by final numeric code placeholders with whitespace
        added before and after.

        :param subtranslate: Whether URL titles and \\text{...} content
            are translated separately
        :return: (preprocessed string, PlaceholderInfo)
        """
        n = 0
        # \\n or might be directly followed by a word character and might be screwed up
        # We count their number of newline combos now to check restoration later.
        nAsterisks = self.combo_count(s, "*")
        nNewlines = self.combo_count(s, "\\n")
        nHashs = self.combo_count(s, "#")
        nUnderscores = self.combo_count(s, "_")

        s, indexPlaceholderMap, n = self.placeholder_replace(s, n, self._index_placeholder_re)
        # Subtranslate URL title
        s, sublurlMap, n = self.placeholder_replace(s, n, self._suburl_re,
            subtrans_groupno=1 if subtranslate else None)
        # Whitespace before and after is relevant for \\text{...}.
        s, textMap, n = self.placeholder_replace(s, n, self._text_re,
            subtrans_groupno=2 if subtranslate else None)
        s, specialCharsMap, n = self.placeholder_replace(s, n, self._special_chars_re)
        s, kaPlaceholderMap, n = self.placeholder_replace(s, n, self._kaplaceholder_re)
        s, entityMap, n = self.placeholder_replace(s, n, self._entity_re)
        s, tableEmptyMap, n = self.placeholder_replace(s, n, self._table_empty_re)
        s, mobilePlaceholderMap, n = self.placeholder_replace(s, n, self._mobile_placeholder_re)
        s, formulaMap, n = self.placeholder_replace(s, n, self._formula_re)
        s, asteriskMap, n = self.placeholder_replace(s, n, self._asterisk_re)
        s, underscoreMap, n = self.placeholder_replace(s, n, self._underscore_re)
        s, hashMap, n = self.placeholder_replace(s, n, self._hash_re)
        s, newlineMap, n = self.placeholder_replace(s, n, self._newline_re)
        s, inputMap, n = self.placeholder_replace(s, n, self._input_re)
        s, imgMap, n = self.placeholder_replace(s, n, self._image_re)
        # Code before tag as code might contain tag
        s, codeMap, n = self.placeholder_replace(s, n, self._code_re)
        s, tagMap, n = self.placeholder_replace(s, n, self._tag_re)

        # The reversed list is the order in which back_replace() restores the parts
        repmap = list(itertools.chain(*[
            specialCharsMap.items(),
            indexPlaceholderMap.items(),
            sublurlMap.items(),
            textMap.items(),
            underscoreMap.items(),
            tableEmptyMap.items(),
            kaPlaceholderMap.items(),
            entityMap.items(),
            mobilePlaceholderMap.items(),
            formulaMap.items(),
            asteriskMap.items(),
            hashMap.items(),
            newlineMap.items(),
            inputMap.items(),
            imgMap.items(),
            codeMap.items(),
            tagMap.items()
        ]))[::-1]

        # Final placeholder replacement
        s = self.final_replace(s, n)
        return s, PlaceholderInfo(n, repmap, nAsterisks, nNewlines, nHashs, nUnderscores)

    def postproc(self, engl, s, info):
        """
        Back-replace placeholders.

        :param engl: The english source string (only used for messages)
        :param s: The translated string containing final placeholders
        :param info: The PlaceholderInfo returned by preproc()
        :return: The restored string, or None if restoring failed
        """
        # Replace numeric placeholders by unicode placeholders
        # This prevents spaces between placeholders cross-affecting each other
        s = self.first_stage_backreplace(s, info.replaceMap)
        if s is None: # Placeholder missing or changed
            return None
        # Replace unicode placeholders by their original value
        s = self.back_replace(s, info.replaceMap)
        # Now no placeholders should be left
        if not self.check_no_placeholders_present(s):
            return None
        #
        # Check if combinations match
        #
        # NOTE(review): info.nHashs is recorded by preproc() but not verified
        # here - confirm whether a "#" check is intentionally left out.
        nAsterisksNew = self.combo_count(s, "*")
        if nAsterisksNew != info.nAsterisks:
            print(red("* not reconstructed in '{}' engl '{}'".format(s, engl), bold=True))
            return None
        nNewlinesNew = self.combo_count(s, "\\n")
        if nNewlinesNew != info.nNewlines:
            print(red("\\n not reconstructed in '{}' engl '{}'".format(s, engl), bold=True))
            return None
        nUnderscoresNew = self.combo_count(s, "_")
        if nUnderscoresNew != info.nUnderscores:
            print(red("_ not reconstructed in '{}' engl '{}'".format(s, engl), bold=True))
            return None
        return s

    def google_translate(self, txt):
        """
        Translate txt from english to the target language using googletrans
        """
        translator = Translator()
        #translate_client = translate.Client()
        #translation = translate_client.translate( txt, target_language=lang)
        #return translation['translatedText']
        # partition: sv-SE => sv
        result = translator.translate(txt, src="en", dest=self.lang.partition("-")[0])
        return result.text

    def check_regex_equal(self, regex, s1, s2, desc):
        """
        Check if regex yields the same (whitespace-stripped) matches
        in the original s1 and the translation s2.
        """
        m1 = [m.group(0).strip() for m in regex.finditer(s1)]
        m2 = [m.group(0).strip() for m in regex.finditer(s2)]
        if m1 != m2:
            print(red("Syntax comparison failed for {} regex:\n\t{}\n\t{}".format(
                desc, str(m1), str(m2)), bold=True))
            print(red("Original: {}".format(s1), bold=True))
            print(red("Translated: {}".format(s2), bold=True))
            return False
        return True

    def translate(self, engl):
        """
        Translate engl to the target language.

        :param engl: The english source string (may be None)
        :return: The translated string, or None if the limit is exhausted
                 or the translation failed any of the consistency checks
        """
        if engl is None:
            return None
        # Use limit on how much to translate at once
        if self.limit <= 0:
            return None # dont translate
        # Check if there are any placeholder-type characters in the string
        if not self.check_no_placeholders_present(engl):
            return None
        # Replace formulas etc. by placeholders.
        # Subtranslation will fail back verification so we'll do it later
        engl_proc, info = self.preproc(engl, subtranslate=False)
        # Check validity of placeholders (should yield original string)
        test_postproc = self.postproc(engl, engl_proc, info)
        if test_postproc != engl:
            print(red("Validation reproduction failed: '{}' instead of '{}'".format(test_postproc, engl)))
            return None
        # Do actual preprocessing with possible subtranslation
        engl_proc, info = self.preproc(engl, subtranslate=True)
        # Perform translation
        translated = self.google_translate(engl_proc)
        # Back-replace placeholders
        txt2 = self.postproc(engl, translated, info)
        # Emit debug data
        print("{", file=self.dbgout)
        print("\tEngl:",engl, file=self.dbgout)
        print("\tMap:",info.replaceMap, file=self.dbgout)
        print("\tPreproc:", engl_proc, file=self.dbgout)
        print("\tTranslated:", translated, file=self.dbgout)
        print("\tResult:", txt2, file=self.dbgout)
        print("}", file=self.dbgout)
        if txt2 is None:
            return None
        # Syntax equivalence check.
        # Ignores whitespace as it will happen for various languages due to grammatics
        # Formula check disabled as it fails for text subtrans
        # Image check disabled as URL subtrans will cause it to fail
        syntax_checks = [
            (self._asterisk_re, "asterisk"),
            (self._entity_re, "entity"),
            (self._newline_re, "newline"),
            (self._input_re, "input"),
            (self._tag_re, "tag"),
            (self._code_re, "code"),
            (self._kaplaceholder_re, "KA placeholder"),
            (self._mobile_placeholder_re, "KA mobile placeholder"),
        ]
        for regex, desc in syntax_checks:
            if not self.check_regex_equal(regex, engl, txt2, desc):
                return None
        # Reduce limit only after successful translation
        self.limit -= 1
        self.count += 1
        if self.count % 100 == 0:
            print("Beastified {} strings".format(self.count))
        return txt2
if __name__ == "__main__":
    # Command line usage: translate a single string and print the result
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument('string', help='The string to translate')
    cli.add_argument('-l', '--lang', default="de", help='The language to translate to')
    options = cli.parse_args()
    translator = FullAutoTranslator(options.lang)
    print(translator.translate(options.string))