def plurr_placeholders(self, str1, str2, **kwargs):
    """Compare the Plurr placeholders of a source string and its translation.

    The check passes (returns True) when the translation is empty, when the
    source contains no Plurr placeholder, or when both strings use exactly
    the same set of placeholder names.

    Raises:
        checks.FilterFailure: when the translation uses placeholders absent
            from the source, or omits placeholders present in the source.
    """
    if str2 == u"":
        return True
    if not plurr_placeholders_regex.search(str1):
        return True

    source_names = set(
        map(clean_plurr_placeholder, plurr_placeholders_regex.findall(str1))
    )
    target_names = set(
        map(clean_plurr_placeholder, plurr_placeholders_regex.findall(str2))
    )

    if source_names == target_names:
        return True

    # Names only the translation knows about are reported first.
    extra_names = target_names - source_names
    if extra_names:
        raise checks.FilterFailure(
            u"Unknown placeholders in translation: %s"
            % u", ".join(extra_names))

    absent_names = source_names - target_names
    if absent_names:
        raise checks.FilterFailure(
            u"Placeholders missing in translation: %s"
            % u", ".join(absent_names))

    return True
def unbalanced_tag_braces(self, str1, str2):
    """Check that `<` / `>` nesting ends at the same level in both strings.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Pieces at odd positions of the split result are the special text;
        # even positions hold ordinary, freely translatable text.
        depth = 0
        for brace in unbalanced_tag_braces_regex.split(text)[1::2]:
            if depth < 0:
                # Once the level has gone negative it is never updated again.
                break
            if brace == '<':
                depth += 1
            elif brace == '>':
                depth -= 1
        return depth

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Unbalanced tag braces")
    return True
def whitespace(self, str1, str2):
    """Check that the whitespace runs of both strings produce one fingerprint.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        pieces = [u"\001"]
        for position, chunk in enumerate(whitespace_regex.split(text)):
            # Empty chunks are recorded wherever they occur so that
            # whitespace at the very beginning or end is detected.
            if chunk == u'':
                pieces.append(chunk)
            # Odd positions hold the special text; even ones are ordinary
            # text and contribute nothing else.
            if position % 2 == 1:
                pieces.append(chunk)
        return u"\001".join(pieces)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Incorrect whitespaces")
    return True
def _generic_check(str1, str2, regex, message):
    """Compare how often each piece of special text occurs in both strings.

    Args:
        str1: source string.
        str2: translated string.
        regex: compiled pattern whose `split()` output alternates ordinary
            text (even positions) with special text (odd positions).
        message: text of the failure raised on mismatch.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: with `message` when the fingerprints differ.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        pieces = regex.split(text)
        # A source that the pattern does not split has nothing to compare.
        if is_source and len(pieces) == 1:
            raise SkipCheck()

        tally = {}
        for special in pieces[1::2]:
            tally[special] = tally.get(special, 0) + 1

        return ''.join(
            u"\001%s\001%s" % (key, tally[key]) for key in sorted(tally)
        )

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(message)
    return True
def unbalanced_curly_braces(self, str1, str2):
    """Check that curly braces are equally numerous and balanced in both strings.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Odd positions of the split result hold the special text.
        braces = unbalanced_curly_braces_regex.split(text)[1::2]

        depth = 0
        for brace in braces:
            if depth < 0:
                # A negative level is never updated again.
                break
            if brace == '{':
                depth += 1
            elif brace == '}':
                depth -= 1

        fingerprint = u"%d\001%d" % (len(braces), depth)
        if is_source and depth != 0:
            # An unbalanced source is always reported: the extra symbol
            # makes its fingerprint differ from any translation's.
            fingerprint += u"\001"
        return fingerprint

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Unbalanced curly braces")
    return True
def changed_attributes(self, str1, str2):
    """Check that the pieces matched by the attribute pattern occur equally often.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Hardcoded rule: web banner images are translated differently,
        # so such sources are not checked at all.
        if is_source and img_banner_regex.match(text):
            raise SkipCheck()

        occurrences = {}
        # Odd positions of the split result hold the special text.
        for special in changed_attributes_regex.split(text)[1::2]:
            occurrences[special] = occurrences.get(special, 0) + 1

        return ''.join(
            u"\001%s\001%s" % (key, occurrences[key])
            for key in sorted(occurrences)
        )

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Changed attributes")
    return True
def doublequoting(self, str1, str2):
    """Check double quotation marks `"` in a translation of an unquoted source.

    Sources that themselves contain `"` are skipped.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        if is_source and '"' in text:
            raise SkipCheck()
        # Only the pieces at odd positions of the split are counted.
        quoted_pieces = text.split('"')[1::2]
        return u"%d\001" % len(quoted_pieces)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Double quotes mismatch")
    return True
def _generic_check(str1, str2, regex, message):
    """Verify that special text matched by `regex` occurs equally in both strings.

    Args:
        str1: source string.
        str2: translated string.
        regex: compiled pattern; the odd-positioned items of its `split()`
            result are treated as special text.
        message: text of the failure raised on mismatch.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: with `message` when the fingerprints differ.
    """

    def get_fingerprint(string, is_source=False, translation=""):
        chunks = regex.split(string)
        if is_source and len(chunks) == 1:
            # Nothing was split off the source, so there is nothing to check.
            raise SkipCheck()

        counts = {}
        # Even positions hold ordinary text and are left out by the slice.
        for special in chunks[1::2]:
            counts.setdefault(special, 0)
            counts[special] += 1

        parts = [u"\001%s\001%s" % (key, counts[key]) for key in sorted(counts)]
        return "".join(parts)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(message)
    return True
def linebreaks_double(self, str1, str2, **kwargs):
    """Check that both strings split into equally many parts on double line breaks.

    Returns True when the part counts agree.

    Raises:
        checks.FilterFailure: when the part counts differ.
    """
    source_parts, target_parts = (
        linebreaks_double_regex.split(text) for text in (str1, str2)
    )
    if len(source_parts) == len(target_parts):
        return True
    raise checks.FilterFailure("Double line breaks mismatch")
def plurr_format(self, str1, str2, **kwargs):
    """Validate the Plurr syntax of a translation.

    The check passes without doing anything when the translation is empty,
    when the source is not Plurr-formatted, or when the `plurr` library
    cannot be imported.

    Args:
        str1: source string.
        str2: translated string.
        **kwargs: must provide `language_code`, passed to Plurr as locale.

    Raises:
        checks.FilterFailure: carrying the message of the SyntaxError that
            Plurr raised while formatting the translation.
    """
    # Nothing to validate for an empty translation ...
    if str2 == u"":
        return True
    # ... or for a source that is not Plurr-formatted.
    if not plurr_format_regex.search(str1):
        return True

    # The library is optional; without it the check is skipped.
    try:
        from plurr import Plurr
    except ImportError:
        return True

    formatter = Plurr()
    try:
        options = {
            "locale": kwargs["language_code"],
            "strict": False,
            "callback": lambda x: "",
        }
        formatter.format(str2, {}, options)
    except SyntaxError as error:
        raise checks.FilterFailure(str(error))

    return True
def plurr_format(self, str1, str2, **kwargs):
    """For plurr-formatted strings, checks the syntax is correct.

    The check passes without doing anything when the translation is empty,
    when the source is not Plurr-formatted, or when the `plurr` library
    cannot be imported.

    Args:
        str1: source string.
        str2: translated string.
        **kwargs: must provide `language_code`, passed to Plurr as locale.

    Raises:
        checks.FilterFailure: carrying the text of the SyntaxError that
            Plurr raised while formatting the translation.
    """
    # Ignore check for empty target strings or non Plurr-formatted
    # source strings
    if str2 == u'' or not plurr_format_regex.search(str1):
        return True

    # Ignore check if library is missing
    try:
        from plurr import Plurr
    except ImportError:
        return True

    plurr = Plurr()

    try:
        plurr.format(str2, {}, {
            'locale': kwargs['language_code'],
            'strict': False,
            'callback': lambda x: '',
        })
    except SyntaxError as e:
        # Exceptions have no `.message` attribute on Python 3; str(e)
        # yields the error text on every supported interpreter.
        raise checks.FilterFailure(str(e))

    return True
def doublequoting(self, str1, str2):
    """Check that double quotation marks `"` are used consistently in both strings.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Only the pieces at odd positions of the split are counted.
        quoted_pieces = text.split('"')[1::2]
        return u"%d\001" % len(quoted_pieces)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Double quotes mismatch")
    return True
def date_format(self, str1, str2):
    """Check that a date-format source keeps the same format pieces when translated.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        if is_source:
            # Skip sources that are not date formats, and the specific
            # English strings that merely look like dates.
            if (not date_format_regex_0.match(text)
                    or date_format_regex_1.match(text)):
                raise SkipCheck()

            # Skip specific source/translation pairs.
            exempt_pairs = (
                (date_format_regex_2, date_format_regex_3),
                (date_format_regex_4, date_format_regex_5),
                (date_format_regex_6, date_format_regex_7),
            )
            for source_regex, translation_regex in exempt_pairs:
                if (source_regex.match(text)
                        and translation_regex.match(translation)):
                    raise SkipCheck()

        pieces = date_format_regex_8.split(text)
        pieces.sort()
        return u"\001".join(pieces)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Incorrect date format")
    return True
def test_check(self, str1, str2):
    """Run `check_translation` with a fingerprint that is always 0.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when `check_translation` rejects the pair.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Constant fingerprint: every string looks the same.
        return 0

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Incorrect test check")
    return True
def unescaped_ampersands(self, str1, str2):
    """Check the translation for broken ampersands when the source has escaped entities.

    Returns True when the source matches no escaped entity, or when the
    broken-ampersand pattern does not split the translation.

    Raises:
        checks.FilterFailure: when the source has escaped entities and the
            broken-ampersand pattern splits the translation.
    """
    if not escaped_entities_regex.search(str1):
        return True
    if len(broken_ampersand_regex.split(str2)) == 1:
        return True
    raise checks.FilterFailure(u"Unescaped ampersand mismatch")
def accelerators(self, str1, str2, **kwargs):
    """Check that accelerator markers (`&`, `_`, `^`) occur equally in both strings.

    Plurr-formatted sources are not checked.

    Returns True when the check is skipped or `check_translation` accepts
    the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(string, is_source=False, translation=""):
        # special rule for banner images in the web client which are
        # translated differently, e.g.:
        # From: <img src="/images/account/bnr_allow.gif"
        #            alt="Allow Account Access" />
        # To:   <h1>Allow Konto Zugriff</h1>
        if is_source and img_banner_regex.match(string):
            raise SkipCheck()

        # temporarily escape HTML entities so they are not split on
        escaped = accelerators_regex_0.sub(r"\001\1\001", string)

        # Odd positions of the split result hold the special text. The
        # markers are only counted, so the escaped entities never need to
        # be restored.
        markers = accelerators_regex_1.split(escaped)[1::2]

        return u"%d\001%d\001%d" % (
            markers.count("&"),
            markers.count("_"),
            markers.count("^"),
        )

    # Ignore check for Plurr-formatted strings
    if plurr_format_regex.search(str1):
        return True

    if check_translation(get_fingerprint, str1, str2):
        return True

    raise checks.FilterFailure(u"Accelerator mismatch")
def incorrectly_escaped_ampersands(self, str1, str2, **kwargs):
    """Check for broken ampersands when the translation has escaped entities.

    Returns True when the translation matches no escaped entity, or when
    the broken-ampersand pattern splits neither the source nor the
    translation.

    Raises:
        checks.FilterFailure: when the translation has escaped entities and
            the broken-ampersand pattern splits either string.
    """
    if not escaped_entities_regex.search(str2):
        return True

    source_is_clean = len(broken_ampersand_regex.split(str1)) == 1
    # The translation is only inspected when the source is clean.
    if source_is_clean and len(broken_ampersand_regex.split(str2)) == 1:
        return True

    raise checks.FilterFailure(u"Escaped ampersand mismatch")
def linebreaks_multiple(self, str1, str2, **kwargs):
    """Check that runs of line breaks have the same lengths, in order, in both strings.

    Returns True when the per-match newline counts agree.

    Raises:
        checks.FilterFailure: when the per-match newline counts differ.
    """

    def newline_counts(text):
        # One entry per regex match: the number of "\n" inside that match.
        counts = []
        for match in linebreaks_multiple_regex.finditer(text):
            counts.append(match.group().count("\n"))
        return counts

    if newline_counts(str1) == newline_counts(str2):
        return True
    raise checks.FilterFailure("Multiple line breaks mismatch")
def plurr_placeholders(self, str1, str2, **kwargs):
    """For plurr-formatted strings, checks placeholders used in target
    strings actually exist in the source string.

    The check passes (returns True) when the translation is empty, when the
    source contains no Plurr placeholder, or when both strings use exactly
    the same set of placeholder names.

    Raises:
        checks.FilterFailure: when the translation uses placeholders absent
            from the source, or omits placeholders present in the source.
    """
    if str2 == u'' or not plurr_placeholders_regex.search(str1):
        return True

    def extract_placeholders(text):
        # Each findall() result is flattened and its empty items dropped.
        # Building a real set here (instead of keeping a lazy map object)
        # lets it be used repeatedly, which Python 3 iterators do not allow.
        return set(
            clean_plurr_placeholder(group)
            for groups in plurr_placeholders_regex.findall(text)
            for group in groups
            if group
        )

    placeholders_source = extract_placeholders(str1)
    placeholders_target = extract_placeholders(str2)

    if placeholders_source == placeholders_target:
        return True

    unknown_in_target = placeholders_target - placeholders_source
    if unknown_in_target:
        # Sorted so that the message does not depend on set ordering.
        raise checks.FilterFailure(
            u'Unknown placeholders in translation: %s' %
            u', '.join(sorted(unknown_in_target))
        )

    missing_in_translation = placeholders_source - placeholders_target
    if missing_in_translation:
        raise checks.FilterFailure(
            u'Placeholders missing in translation: %s' %
            u', '.join(sorted(missing_in_translation))
        )

    return True
def mustache_placeholder_pairs(self, str1, str2):
    """Check that Mustache section tags in the translation are properly paired.

    The source fingerprint is always 1; the translation fingerprint is 1
    only when every opening tag (`#` or `^`) is closed in reverse order of
    opening and no tag is opened twice while still open.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        pieces = mustache_placeholder_pairs_regex.split(text)

        if is_source:
            # Only sources with an opening tag are checked at all. Note
            # that the outer source string is inspected here.
            if mustache_placeholder_pairs_open_tag_regex.search(str1):
                return 1
            raise SkipCheck()

        open_tags = []
        # Odd positions of the split result hold the special text.
        for marker in pieces[1::2]:
            # extract 'tagname' from '{{#tagname}}'
            name = marker[3:-2]
            if marker[2:3] in ['#', '^']:
                # Opening tag: the same tag must not already be open.
                if name in open_tags:
                    return 0
                open_tags.append(name)
            elif open_tags and open_tags[-1] == name:
                # Closing tag '{{/tagname}}' matching the innermost one.
                open_tags.pop()
            else:
                return 0

        # Anything left open makes the translation invalid.
        return 0 if open_tags else 1

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"mustache_placeholder_pairs")
    return True
def double_quotes_in_tags(self, str1, str2):
    """Check that double quotation marks `"` inside tags are consistent
    between the two strings.

    Returns True for banner-image sources, and otherwise when the source's
    quotes are paired and either the fingerprints match or the source has
    no quotes in tags while the translation's quotes are paired.

    Raises:
        checks.FilterFailure: in every other case.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        quotes_by_depth = {}
        depth = 0
        pieces = unbalanced_tag_braces_regex.split(text)
        for position, piece in enumerate(pieces):
            if position % 2 == 0:
                # Text between braces: count its quotes only inside a tag.
                if depth > 0:
                    quotes_by_depth[depth] += piece.count('"')
                continue

            # special text; a negative level is never updated again
            if depth < 0:
                continue
            if piece == '<':
                depth += 1
                quotes_by_depth.setdefault(depth, 0)
            if piece == '>':
                depth -= 1

        # Only nesting levels that actually contain quotes take part.
        quoted = sorted(
            (level, count)
            for level, count in quotes_by_depth.items()
            if count > 0
        )
        fingerprint = ''.join(u"\001%s\001%s" % pair for pair in quoted)
        quotes_paired = all(count % 2 == 0 for _, count in quoted)
        return fingerprint, quotes_paired

    # hardcoded rule: skip web banner images which are translated
    # differently
    if img_banner_regex.match(str1):
        return True

    source_fp, source_paired = get_fingerprint(str1, is_source=True)
    if source_paired:
        target_fp, target_paired = get_fingerprint(str2, is_source=False)
        if (source_fp == '' and target_paired) or source_fp == target_fp:
            return True

    raise checks.FilterFailure(u"Double quotes in tags mismatch")
def date_format(self, str1, str2, **kwargs):
    """Check that the translation of a date-format source is a date format too.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(string, is_source=False, translation=""):
        looks_like_date = bool(date_format_regex.match(string))

        # Sources that are not date formats, and the specific English
        # strings that are known not to be dates, are not checked.
        if is_source and (
            not looks_like_date or date_format_exception_regex.match(string)
        ):
            raise SkipCheck()

        return looks_like_date

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Incorrect date format")
    return True
def mustache_like_placeholder_pairs(self, str1, str2):
    """Check that every closing `{{/tag}}` in the translation has an opening tag.

    Sources containing a real Mustache opening tag are skipped. The source
    fingerprint is always 1; the translation fingerprint is 1 unless a
    closing tag is met for which no opening tag is still outstanding, in
    which case it is None.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(str, is_source=False, translation=''):
        chunks = mustache_like_placeholder_pairs_regex.split(str)

        if is_source:
            # Note that the outer source string is inspected here.
            if mustache_placeholder_pairs_open_tag_regex.search(str1):
                raise SkipCheck()
            return 1

        # Number of still-unclosed openings per tag name.
        open_counts = {}
        # Odd positions of the split result hold the special text.
        for chunk in chunks[1::2]:
            if chunk[2:3] != '/':
                # opening tag: count it under its *name*, so that repeated
                # openings of one tag accumulate instead of resetting to 1
                tag = chunk[2:-2]
                open_counts[tag] = open_counts.get(tag, 0) + 1
            else:
                # closing tag
                # extract 'tagname' from '{{/tagname}}'
                tag = chunk[3:-2]
                if not open_counts.get(tag):
                    # closed without an outstanding opening
                    return None
                open_counts[tag] -= 1

        return 1

    if check_translation(get_fingerprint, str1, str2):
        return True
    else:
        raise checks.FilterFailure(u"mustache_like_placeholder_pairs")
def tags_differ(self, str1, str2):
    """Check that each tag occurs equally often in both strings.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Hardcoded rules: skip web banner images (translated differently)
        # and strings that look like tags but are not.
        if is_source and (
            img_banner_regex.match(text) or no_tags_regex.match(text)
        ):
            raise SkipCheck()

        tag_counts = {}
        # Odd positions of the split result hold the special text.
        for piece in tags_differ_regex_0.split(text)[1::2]:
            found = tags_differ_regex_1.match(piece)
            if not found:
                continue
            name = found.group(1)
            tag_counts[name] = tag_counts.get(name, 0) + 1

        return ''.join(
            u"\001%s\001%s" % (name, tag_counts[name])
            for name in sorted(tag_counts)
        )

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Tags differ")
    return True
def c_format(self, str1, str2):
    """Check that C format specifiers appear identically and in the same order.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Odd positions of the split result hold the special text; they
        # are concatenated in order of appearance.
        specifiers = c_format_regex.split(text)[1::2]
        return ''.join(u"\001%s" % specifier for specifier in specifiers)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Incorrect C format")
    return True
def unescaped_ampersands(self, str1, str2):
    """Check that both strings use the same kinds of ampersand.

    The fingerprint records only presence, not counts: 2 is added when a
    bare `&` was matched and 1 when any other match was found.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # When the translation holds no ampersand at all, the source
        # fingerprint is None rather than a number.
        if is_source and u"&" not in translation:
            return None

        # Odd positions of the split result hold the special text.
        matches = unescaped_ampersands_regex.split(text)[1::2]
        has_unescaped = any(match == '&' for match in matches)
        has_escaped = any(match != '&' for match in matches)

        return (2 if has_unescaped else 0) + (1 if has_escaped else 0)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Unescaped ampersand mismatch")
    return True
def non_printable(self, str1, str2):
    """Check that non-printable characters appear identically and in the same order.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        encoded = []
        # Odd positions of the split result hold the special text; each is
        # recorded as its code point in hexadecimal.
        for char in non_printable_regex.split(text)[1::2]:
            label = '{0x%02x}' % ord(char)
            encoded.append(u"\001%s" % label)
        return ''.join(encoded)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Non printable mismatch")
    return True
def unbalanced_curly_braces(self, str1, str2, **kwargs):
    """Check that curly braces are equally numerous and balanced in both strings.

    Plurr-formatted sources are not checked.

    Returns True when the check is skipped or `check_translation` accepts
    the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """
    # Ignore check for Plurr-formatted strings
    if plurr_format_regex.search(str1):
        return True

    def get_fingerprint(string, is_source=False, translation=""):
        # Odd positions of the split result hold the special text.
        braces = unbalanced_curly_braces_regex.split(string)[1::2]

        depth = 0
        for brace in braces:
            if depth < 0:
                # A negative level is never updated again.
                break
            if brace == "{":
                depth += 1
            elif brace == "}":
                depth -= 1

        parts = [u"%d\001%d" % (len(braces), depth)]
        if is_source and depth != 0:
            # An unbalanced source is always reported: the extra symbol
            # makes its fingerprint differ from any translation's.
            parts.append(u"\001")
        return u"".join(parts)

    if not check_translation(get_fingerprint, str1, str2):
        raise checks.FilterFailure(u"Unbalanced curly braces")
    return True
def potential_unwanted_placeholders(self, str1, str2):
    """Check that the translation has no more placeholder signs than the source.

    Returns True when the source count is at least the translation count.

    Raises:
        checks.FilterFailure: when the translation has more placeholder
            signs than the source.
    """

    def get_fingerprint(text, is_source=False, translation=''):
        # Only the odd-positioned pieces of the split are placeholder
        # signs; a list of n pieces has n // 2 of them.
        return len(potential_placeholders_regex.split(text)) // 2

    source_count = get_fingerprint(str1, True, str2)
    target_count = get_fingerprint(str2, False, str1)

    if source_count < target_count:
        raise checks.FilterFailure(u"Potential unwanted placeholders")
    return True
def broken_entities(self, str1, str2):
    """Check that the translation does not introduce broken HTML entities.

    The fingerprint starts at 1 and grows by one for every problem found in
    an entity-like chunk. A source that already has problems gets a string
    fingerprint, which differs from the numeric fingerprint of any
    translation.

    Returns True when `check_translation` accepts the pair.

    Raises:
        checks.FilterFailure: when the fingerprints are reported as different.
    """
    # Raw string avoids the invalid "\D" escape; compiled once per call
    # instead of once per matching chunk.
    non_digit_regex = re.compile(r"\D", re.U)

    def get_fingerprint(str, is_source=False, translation=''):
        fingerprint = 1

        # Odd positions of the split result hold the special text.
        for chunk in broken_entities_regex_0.split(str)[1::2]:
            # check if ';' is present at the end for some known named
            # entities that should never match as false positives in
            # the normal text
            if broken_entities_regex_1.match(chunk):
                fingerprint += 1

            # check if ';' is present at the end for numeric and
            # hexadecimal entities
            if broken_entities_regex_2.match(chunk):
                fingerprint += 1

            # check if a prefix '#' symbol is missing for a numeric
            # entity
            if broken_entities_regex_3.match(chunk):
                fingerprint += 1

            # check if a prefix '#' symbol is missing for a hexadecimal
            # entity
            if broken_entities_regex_4.match(chunk):
                fingerprint += 1

            # check if a prefix 'x' symbol is missing (or replaced with
            # something else) for a hexadecimal entity
            mo = broken_entities_regex_5.match(chunk)
            if mo:
                if non_digit_regex.match(mo.group(1)) or \
                        non_digit_regex.match(mo.group(2)):
                    fingerprint += 1

            # the checks below are conservative, i.e. they do not include
            # the full valid Unicode range but just test for common
            # mistakes in real-life XML/HTML entities

            # check if a numbered entity is within acceptable range
            mo = broken_entities_regex_6.match(chunk)
            if mo:
                if int(mo.group(1)) > 65535:
                    fingerprint += 1

            # check if a hexadecimal numbered entity length is within
            # acceptable range
            mo = broken_entities_regex_7.match(chunk)
            if mo:
                if int(mo.group(1), 16) > 65535:
                    fingerprint += 1

        if is_source and fingerprint > 1:
            # A broken source is always reported: turning the fingerprint
            # into a string guarantees a mismatch.
            fingerprint = u"%d\001" % fingerprint

        return fingerprint

    if check_translation(get_fingerprint, str1, str2):
        return True
    else:
        raise checks.FilterFailure(u"Broken HTML entities")