def string_handler(token, template):
    """Validate and normalise a string extracted from a markdown file.

    Parameters:
        token: Tuple of (string, string_type) where string_type names the
            markdown element the string belongs to. string_type can be None.
        template: the template of the resource.

    Returns:
        The manipulated string, or None when the string must not be
        extracted: liquid template tags, reference lines whose label is an
        integer, and plain numeric values. The result may also be an empty
        string when nothing is left after manipulation.
    """
    string, key = token
    # Drop new lines around string.
    string = string.strip('\n')

    # For code blocks we need to maintain the exact indentation as in the
    # source file, both for matching the string and replacing it in the
    # template and for producing valid markdown on compilation.
    if key == 'block_code':
        lines = string.split('\n')
        indents = re.findall(
            ensure_unicode(r'\n( *){}').format(re.escape(lines[0])),
            template)
        # The first line of the block may not be preceded by a newline in
        # the template (e.g. it opens the document). Treat that as "no
        # indentation" instead of failing with IndexError.
        spaces = indents[0] if indents else ''
        if spaces:
            string = u''.join(
                u'\n{}{}'.format(spaces, line) for line in lines)

    # Line is a liquid template tag, ignore.
    if string.startswith('{%') and string.endswith('%}'):
        return None

    # Drop # chars from beginning of the string.
    match_header_line = re.search(ensure_unicode(r'^#+\s'), string)
    if match_header_line:
        # Slice off only the leading marker: str.replace() would also delete
        # every later occurrence of the same "#... " sequence in the heading.
        return string[match_header_line.end():]

    # Extract Text from `[Text]: link` or `"[Text]: link"` lines.
    match_reference = re.search(
        ensure_unicode(r'^"?\[([^\[^\]]+)\]:.+"?$'), string)
    if match_reference:
        label = match_reference.group(1)
        try:
            int(label)
        except ValueError:
            # Get content between brackets if it's not an integer number.
            return label
        return None

    # Exclude numeric values from the stringset.
    try:
        float(string)
    except ValueError:
        return string
    return None
def parse(self, content, **kwargs):
    """Extract the translatable strings of a markdown document.

    The optional YAML front matter is passed to ``self.yaml_parser`` and the
    rest is lexed with ``TxBlockLexer``. Every extracted string that can be
    located in the template (searching forward from the previous
    replacement) is swapped for its ``template_replacement``.

    :param content: the markdown source.
    :return: a ``(template, stringset)`` tuple; the template is converted
        back to the newline type detected in ``content``.
    """
    newline_type = find_newline_type(content)
    if newline_type == 'DOS':
        content = force_newline_type(content, 'UNIX')

    # mistune expands tabs to 4 spaces and trims trailing spaces, so we
    # need to do the same in order to be able to match the substrings
    blank_line = re.compile(ensure_unicode(r'^ +$'), re.M)
    content = blank_line.sub('', content.expandtabs(4))
    template = content

    header_match = re.match(
        ensure_unicode(r'^(---\s+)([\s\S]*?[^`])\s*(\n---\s+)(?!-)'),
        content)
    if header_match:
        yaml_header_content = header_match.group()
        md_content = content[len(yaml_header_content):]
        yaml_stringset = self.yaml_parser(yaml_header_content)
    else:
        md_content = content
        yaml_stringset = []

    block = TxBlockLexer()
    markdown = Markdown(block=block)
    # Making sure stringset is empty because of recursive inside `markdown`
    block.md_stringset = []
    # Command that populates block.stringset var
    markdown(md_content)

    stringset = []
    curr_pos = 0
    for token in (yaml_stringset + block.md_stringset):
        string = string_handler(token, template)
        if not string:
            continue
        found_at = template.find(string, curr_pos)
        if found_at == -1:
            continue

        order = len(stringset)
        string_object = OpenString(six.text_type(order), string,
                                   order=order)
        stringset.append(string_object)

        # Splice the hash in and keep track of where it ends
        replacement = string_object.template_replacement
        template = (template[:found_at] + replacement +
                    template[found_at + len(string):])
        curr_pos = template.find(replacement) + len(replacement)

    return force_newline_type(template, newline_type), stringset
def parse(self, content, **kwargs):
    """Build the template and stringset for a markdown document.

    YAML front matter, when present, goes through ``self.yaml_parser``; the
    remaining markdown is lexed with ``TxBlockLexer``. Strings are replaced
    in document order, each search starting after the previous replacement.

    :param content: the markdown source.
    :return: a ``(template, stringset)`` tuple, with the template restored
        to the newline type detected in ``content``.
    """
    newline_type = find_newline_type(content)
    if newline_type == 'DOS':
        content = force_newline_type(content, 'UNIX')

    # mistune expands tabs to 4 spaces and trims trailing spaces, so we
    # need to do the same in order to be able to match the substrings
    only_spaces = re.compile(ensure_unicode(r'^ +$'), re.M)
    content = only_spaces.sub('', content.expandtabs(4))
    template = content

    yml_header = re.match(
        ensure_unicode(r'^(---\s+)([\s\S]*?[^`])\s*(\n---\s+)(?!-)'),
        content)
    if yml_header:
        yaml_header_content = yml_header.group()
        md_content = content[len(yaml_header_content):]
        yaml_stringset = self.yaml_parser(yaml_header_content)
    else:
        md_content = content
        yaml_stringset = []

    block = TxBlockLexer()
    markdown = Markdown(block=block)
    # Making sure stringset is empty because of recursive inside `markdown`
    block.md_stringset = []
    # Command that populates block.stringset var
    markdown(md_content)

    stringset = []
    curr_pos = 0
    for token in (yaml_stringset + block.md_stringset):
        string = string_handler(token, template)
        if not string:
            continue
        position = template.find(string, curr_pos)
        if position == -1:
            continue

        order = len(stringset)
        string_object = OpenString(six.text_type(order), string,
                                   order=order)
        stringset.append(string_object)

        # Keep track of the index of the last replaced hash
        placeholder = string_object.template_replacement
        template = (template[:position] + placeholder +
                    template[position + len(string):])
        curr_pos = template.find(placeholder) + len(placeholder)

    return force_newline_type(template, newline_type), stringset
def _compile_story(self, story_content):
    """Compile a single story.

    Strings are consumed from the front of ``self.stringset`` for as long
    as their hash is found in the story; the first string whose hash is
    missing stays in the list and ends the loop.

    args:
        story_content: the xml content of the story
    returns:
        compiled_story: the compiled story content
    """
    hash_regex = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))
    transcriber = Transcriber(story_content)

    while self.stringset:
        head = self.stringset[0]
        try:
            hash_position = story_content.index(head.template_replacement)
        except ValueError:
            # Not part of this story: leave it queued for the next one
            break
        self.stringset.pop(0)
        transcriber.copy_until(hash_position)
        transcriber.add(self._escape_amps(head.string))
        transcriber.skip(len(head.template_replacement))

    # Copy whatever follows the last replaced hash
    transcriber.copy_until(len(story_content))

    # in case there are any hashes that have not been replaced, replace
    # them with an empty string
    return hash_regex.sub(u'', transcriber.get_destination())
def yaml_parser(self, yaml_header):
    """Extract candidate strings from a YAML front-matter header.

    TODO: This is a temporary solution. Yaml header should be parsed with
    an actual yaml parser when it is implemented in openformats.

    Every ``key: value`` line contributes its stripped value. When the value
    is exactly ``|``, ``>`` or ``[``, the following lines that are indented
    at least two spaces more than the key line are collected as one string
    (each line kept verbatim and terminated with a newline).

    :param yaml_header: the raw text of the header.
    :return: a list of ``(string, None)`` tuples in document order.
    """
    yaml_strings = []
    block_lines = None  # list of collected lines while inside a block
    indent = 0

    for line in yaml_header.splitlines():
        # ignore comments
        if line.startswith('#'):
            continue

        if block_lines is not None:
            # at least 2 spaces more indented than the parent line
            if line.startswith(' ' * (indent + 2)):
                block_lines.append(line)
                continue
            yaml_strings.append(
                (''.join(item + '\n' for item in block_lines), None))
            block_lines = None

        key_value = line.split(':', 1)
        if len(key_value) != 2:
            continue

        value = key_value[1].strip()
        # Compare against the exact indicators; a substring test would
        # also accept values such as '|>' or '>['.
        if value in ('|', '>', '['):
            key_part = key_value[0]
            indent = len(key_part) - len(key_part.lstrip(' '))
            block_lines = []
            continue
        yaml_strings.append((value, None))

    # A block that runs until the end of the header has no following line
    # to trigger the flush above, so emit it here instead of dropping it.
    if block_lines is not None:
        yaml_strings.append(
            (''.join(item + '\n' for item in block_lines), None))

    return yaml_strings
def find_closing(self, start):
    """Locate the end of the tag that opens at ``start``.

    ``start`` is assumed to be the index of a ``<`` in ``self.content``.

    Returns a ``(closing_start, closing_end)`` tuple of indices into
    ``self.content``: the span of ``-->`` for a comment, the end of the tag
    (twice) for a single tag, otherwise the span of the match that brings
    the open/close count back to zero. Nothing is returned explicitly when
    no such match exists.
    """
    remainder = self.content[start:]

    if self.content[start:start + 4] == "<!--":
        # Special case for comment
        comment_end = start + remainder.index("-->")
        return comment_end, comment_end + 3

    opening_match = self.opening_tag_pat.search(remainder)
    if self.single_tag_pat.search(opening_match.group()):
        # Single tag, eg `<foo a="b" />`
        tag_end = start + opening_match.end()
        return tag_end, tag_end

    tag_name = opening_match.groupdict()['name']
    tag_pat = re.compile(
        ensure_unicode(r'\<(?:(?:{tag_name})|(?:/{tag_name}\>))'.format(
            tag_name=re.escape(tag_name))))
    matches = tag_pat.finditer(remainder)

    first_match = next(matches)
    assert first_match and first_match.start() == 0 and\
        first_match.group()[1] != '/'

    depth = 1
    for match in matches:
        # Every match starts with '<', so a '/' in second position is the
        # only way to recognise a closing tag here.
        if match.group()[1] == '/':
            depth -= 1
        else:
            depth += 1
        if depth == 0:
            return start + match.start(), start + match.end()
def yaml_parser(self, yaml_header):
    """Extract candidate strings from a YAML front-matter header.

    TODO: This is a temporary solution. Yaml header should be parsed with
    an actual yaml parser when it is implemented in openformats.

    Every ``key: value`` line contributes its stripped value. When the value
    is exactly ``|``, ``>`` or ``[``, the following lines that are indented
    at least two spaces more than the key line are collected as one string
    (each line kept verbatim and terminated with a newline).

    :param yaml_header: the raw text of the header.
    :return: a list of ``(string, None)`` tuples in document order.
    """
    yaml_strings = []
    pending = None  # lines of the block being collected, if any
    indent = 0

    def flush(lines):
        # Emit the collected block as a single newline-terminated string.
        yaml_strings.append((''.join(item + '\n' for item in lines), None))

    for line in yaml_header.splitlines():
        # ignore comments
        if line.startswith('#'):
            continue

        if pending is not None:
            # at least 2 spaces more indented than the parent line
            if line.startswith(' ' * (indent + 2)):
                pending.append(line)
                continue
            flush(pending)
            pending = None

        key_value = line.split(':', 1)
        if len(key_value) != 2:
            continue

        value = key_value[1].strip()
        # Compare against the exact indicators; a substring test would
        # also accept values such as '|>' or '>['.
        if value in ('|', '>', '['):
            key_part = key_value[0]
            indent = len(key_part) - len(key_part.lstrip(' '))
            pending = []
            continue
        yaml_strings.append((value, None))

    # A block that runs until the end of the header has no following line
    # to trigger the flush above, so emit it here instead of dropping it.
    if pending is not None:
        flush(pending)

    return yaml_strings
def _compile_story(self, story_content):
    """Compile a single story.

    Strings are consumed from the front of ``self.stringset`` for as long
    as their hash is found in the story; the first string whose hash is
    missing stays in the list and ends the loop.

    args:
        story_content: the xml content of the story
    returns:
        compiled_story: the compiled story content
    """
    hash_regex = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))
    transcriber = Transcriber(story_content)

    while self.stringset:
        head = self.stringset[0]
        try:
            hash_position = story_content.index(head.template_replacement)
        except ValueError:
            # Not part of this story: leave it queued for the next one
            break
        self.stringset.pop(0)
        transcriber.copy_until(hash_position)
        transcriber.add(head.string)
        transcriber.skip(len(head.template_replacement))

    # Copy whatever follows the last replaced hash
    transcriber.copy_until(len(story_content))

    # in case there are any hashes that have not been replaced, replace
    # them with an empty string
    return hash_regex.sub(u'', transcriber.get_destination())
def find(self, tags=None):
    """Yield the tags found inside this element's content.

    :param tags: a single tag name or a list of tag names to look for. When
        omitted or empty, every tag is yielded.
    :return: a generator of ``(DumbXml, offset)`` pairs, where ``offset`` is
        the index in ``self.content`` at which the found tag starts. The
        match at position 0 (this element's own opening tag) and matches
        that ``self._is_within_comment`` reports are skipped.
    """
    # `None` replaces the former mutable `[]` default; both are falsy, so
    # "no filter" behaves exactly as before.
    if isinstance(tags, (six.binary_type, six.text_type)):
        tags = [tags]

    if not tags:
        pat = re.compile(ensure_unicode(r'\<'), re.DOTALL)
    else:
        pat = re.compile(
            ensure_unicode(r'\<(?:{})'.format('|'.join(
                (re.escape(tag) for tag in tags)))), re.DOTALL)

    for match in pat.finditer(self.content):
        offset = match.start()
        if offset == 0 or self._is_within_comment(match):
            continue
        _closing_start, closing_end = self.find_closing(offset)
        found = DumbXml(self.content[offset:closing_end])
        # The pattern matches on a name prefix, so re-check the parsed name
        if not tags or found.name in tags:
            yield found, offset
def __init__(self, content):
    """Parse ``content`` and set the following attributes on ``self``:

    * content: The content of the tag, including the opening/closing tags
    * name: The name of the tag
    * attrs: A dictionary of all the attributes of the tag with their
      values
    * inner_offset: the place of the character where the inner content of
      the tag starts, aka the length of the opening tag
    * inner: the inner content of the tag
    """
    # Fix regex encoding
    self.opening_tag_pat = re.compile(
        ensure_unicode(self.OPENING_TAG_PAT), re.DOTALL)
    self.attr_pat = re.compile(ensure_unicode(self.ATTR_PAT))
    self.single_tag_pat = re.compile(ensure_unicode(self.SINGLE_TAG_PAT))

    self.content = content

    if content[:4] == "<!--":
        # Special case for comment
        self.inner_offset = 4
        self.name = self.COMMENT
        self.attrs = {}
        self.inner = content[4:content.index("-->")]
        return

    opening_match = self.opening_tag_pat.search(content)
    opening_groups = opening_match.groupdict()
    self.inner_offset = opening_match.end()
    self.name = opening_groups['name']
    self.attrs = {
        attr.groupdict()['key']: attr.groupdict()['value']
        for attr in self.attr_pat.finditer(opening_groups['attrs'])
    }

    closing_start, _closing_end = self.find_closing(0)
    self.inner = content[opening_match.end():closing_start]
def compile(self, template, stringset, **kwargs):
    """Replace every hash in ``template`` with its string(s).

    Non-pluralized strings replace their hash directly. For pluralized
    strings, each plural form is rendered with ``self.plural_template``;
    when the hash sits on a line of its own (the text before and after it
    on that line matches ``self.SPACE_PAT``), the whole line is repeated
    once per plural form, keeping its indent and tail.

    The stringset is assumed to be ordered within the template.
    """
    # Fix regex encoding
    space_pattern = re.compile(ensure_unicode(self.SPACE_PAT))

    transcriber = Transcriber(template)
    template = transcriber.source

    for string in stringset:
        placeholder = string.template_replacement
        hash_length = len(placeholder)
        hash_position = template.index(placeholder)

        if not string.pluralized:
            transcriber.copy_until(hash_position)
            transcriber.add(string.string)
            transcriber.skip(hash_length)
            continue

        # if the hash is on its own on a line with only spaces, we have
        # to remember its indent
        line_start = template.rindex('\n', 0, hash_position + 1) + 1
        indent_length = hash_position - line_start
        indent = template[line_start:hash_position]

        hash_end = hash_position + hash_length
        tail_length = template.index('\n', hash_end) - hash_end
        tail = template[hash_end:hash_end + tail_length]

        if space_pattern.search(indent) and space_pattern.search(tail):
            transcriber.copy_until(line_start)
            for rule, value in six.iteritems(string.string):
                rendered = self.plural_template.format(
                    rule=self.RULES_ITOA[rule], string=value)
                transcriber.add(indent + rendered + tail + '\n')
            # Skip the whole original line, including its newline
            transcriber.skip(indent_length + hash_length + tail_length + 1)
        else:
            # string is not on its own, simply replace hash with all
            # plural forms
            transcriber.copy_until(hash_position)
            for rule, value in six.iteritems(string.string):
                transcriber.add(self.plural_template.format(
                    rule=self.RULES_ITOA[rule], string=value))
            transcriber.skip(hash_length)

    transcriber.copy_until(len(template))
    return transcriber.get_destination()
def find(self, tags=None):
    """Yield the tags found inside this element's content.

    :param tags: a single tag name or a list of tag names to look for. When
        omitted or empty, every tag is yielded.
    :return: a generator of ``(DumbXml, offset)`` pairs, where ``offset`` is
        the index in ``self.content`` at which the found tag starts. The
        match at position 0 (this element's own opening tag) and matches
        that ``self._is_within_comment`` reports are skipped.
    """
    # `None` replaces the former mutable `[]` default; both are falsy, so
    # "no filter" behaves exactly as before.
    if isinstance(tags, (six.binary_type, six.text_type)):
        tags = [tags]

    if tags:
        alternatives = '|'.join((re.escape(tag) for tag in tags))
        pat = re.compile(
            ensure_unicode(r'\<(?:{})'.format(alternatives)), re.DOTALL)
    else:
        pat = re.compile(ensure_unicode(r'\<'), re.DOTALL)

    for match in pat.finditer(self.content):
        offset = match.start()
        if offset == 0 or self._is_within_comment(match):
            continue
        _closing_start, closing_end = self.find_closing(offset)
        found = DumbXml(self.content[offset:closing_end])
        # The pattern matches on a name prefix, so re-check the parsed name
        if not tags or found.name in tags:
            yield found, offset
def _find_and_replace(self, story_xml):
    """Replace the translatable content of a story with string hashes.

    Every match of ``self.CONTENT_REGEX`` is passed to ``self._replace``,
    whose return value is substituted into the result.

    args:
        story_xml (str): The xml content of a single Story of the IDML file
    returns:
        the input string with the substitutions applied.
    """
    content_pattern = ensure_unicode(self.CONTENT_REGEX)
    return re.sub(content_pattern, self._replace, story_xml)
def compile(self, template, stringset, **kwargs):
    """Replace every hash in ``template`` with its string(s).

    Non-pluralized strings replace their hash directly. For pluralized
    strings, each plural form is rendered with ``self.plural_template``;
    when the hash sits on a line of its own (the text before and after it
    on that line matches ``self.SPACE_PAT``), the whole line is repeated
    once per plural form, keeping its indent and tail.

    The stringset is assumed to be ordered within the template.
    """
    # Fix regex encoding
    space_pattern = re.compile(ensure_unicode(self.SPACE_PAT))

    transcriber = Transcriber(template)
    template = transcriber.source

    for string in stringset:
        hash_text = string.template_replacement
        hash_size = len(hash_text)
        hash_position = template.index(hash_text)

        if not string.pluralized:
            transcriber.copy_until(hash_position)
            transcriber.add(string.string)
            transcriber.skip(hash_size)
            continue

        # if the hash is on its own on a line with only spaces, we have
        # to remember its indent
        previous_newline = template.rindex('\n', 0, hash_position + 1)
        indent_length = hash_position - previous_newline - 1
        indent = template[hash_position - indent_length:hash_position]

        after_hash = hash_position + hash_size
        tail_length = template.index('\n', after_hash) - after_hash
        tail = template[after_hash:after_hash + tail_length]

        on_own_line = (space_pattern.search(indent) and
                       space_pattern.search(tail))
        if on_own_line:
            transcriber.copy_until(hash_position - indent_length)
            for rule, value in six.iteritems(string.string):
                plural_form = self.plural_template.format(
                    rule=self.RULES_ITOA[rule], string=value)
                transcriber.add(indent + plural_form + tail + '\n')
            # Skip the whole original line, including its newline
            transcriber.skip(indent_length + hash_size + tail_length + 1)
        else:
            # string is not on its own, simply replace hash with all
            # plural forms
            transcriber.copy_until(hash_position)
            for rule, value in six.iteritems(string.string):
                transcriber.add(self.plural_template.format(
                    rule=self.RULES_ITOA[rule], string=value))
            transcriber.skip(hash_size)

    transcriber.copy_until(len(template))
    return transcriber.get_destination()
def _is_custom_tag(self, tag):
    """Check whether a value is tagged with a custom type.

    Detect custom tags, like:
        `foo: !bar test`
        `foo: !xml "<bar>Bar</bar>"`

    Built-in types, indicated by a `!!` prefix, are not expected to match:
    the PyYAML library reports such entries with the built-in identifier
    (for example `tag:yaml.org,2002:str`, not `!!str`), which is also why
    the information that a built-in tag was used cannot be preserved.

    Returns the match object when ``tag`` matches, otherwise None.
    """
    custom_tag_pattern = re.compile(
        ensure_unicode(r'^[\![a-zA-Z_]*]*$'), re.IGNORECASE)
    return custom_tag_pattern.match(tag)
def _can_skip_content(self, string):
    """Tell whether a piece of XML content is not translatable.

    Content is skipped when nothing is left after removing the special
    characters and surrounding whitespace, when the original string can be
    evaluated to a number, or when the remaining text holds no translatable
    character.
    """
    special_characters = ensure_unicode(self.SPECIAL_CHARACTERS_REGEX)
    stripped_string = re.sub(special_characters, u'', string).strip()
    if not stripped_string:
        return True

    try:
        float(string.strip())
    except ValueError:
        pass
    else:
        return True

    return not self._contains_translatable_character(stripped_string)
def parse(self, key, value):
    """Parse a string written in a subset of the ICU message format.

    For the time being, only the plurals format is supported. If `value`
    doesn't match the proper format, None is returned.

    Note: if we want to support more ICU features in the future, this
    would probably have to be refactored.

    :param key: the string key
    :param value: the serialized string that has all the content,
        formatted like this:
            {
                item_count, plural,
                one { You have {file_count} file. }
                other { You have {file_count} files. }
            }
    :return: the result of ``self._parse_pluralized_string`` for plural
        strings, or None if the string is not in the supported ICU format
    :rtype: ICUString
    :raise ParseError: if the given string looks a lot like an ICU plural
        string but has an invalid structure
    """
    icu_pattern = ensure_unicode(
        r'\s*{\s*([A-Za-z-_\d]+)\s*,\s*([A-Za-z_]+)\s*,\s*(.*)}\s*')
    matches = re.match(icu_pattern, value)
    if matches is None:
        return None

    keyword, argument, serialized_strings = matches.groups()
    if argument != ICUParser.PLURAL_ARG:
        return None

    return self._parse_pluralized_string(
        key, keyword, value, serialized_strings)
def parse(self, key, value):
    """Parse a string written in a subset of the ICU message format.

    For the time being, only the plurals format is supported. If `value`
    doesn't match the proper format, None is returned.

    Note: if we want to support more ICU features in the future, this
    would probably have to be refactored.

    :param key: the string key
    :param value: the serialized string that has all the content,
        formatted like this:
            {
                item_count, plural,
                one { You have {file_count} file. }
                other { You have {file_count} files. }
            }
    :return: the result of ``self._parse_pluralized_string`` for plural
        strings, or None if the string is not in the supported ICU format
    :rtype: ICUString
    :raise ParseError: if the given string looks a lot like an ICU plural
        string but has an invalid structure
    """
    matches = re.match(
        ensure_unicode(
            r'\s*{\s*([A-Za-z-_\d]+)\s*,\s*([A-Za-z_]+)\s*,\s*(.*)}\s*'),
        value)
    if matches is None:
        return None

    keyword, argument, serialized_strings = matches.groups()
    is_plural = argument == ICUParser.PLURAL_ARG
    if not is_plural:
        return None

    return self._parse_pluralized_string(
        key, keyword, value, serialized_strings)
def _get_indent(self, template):
    """Work out how many spaces the original file uses for indentation.

    Args:
        template: The saved template

    Returns:
        The number of spaces; tabs count as 4 spaces. Two is assumed when
        no indentation can be detected.
    """
    # Match all whitespace characters after the first `:` that ends a line
    # (end of first key). Stops on first non whitespace character.
    found = re.search(
        ensure_unicode(r':\r?\n(?P<indent>[ \t\n]+)'), template)
    whitespace = found.group('indent') if found else ' ' * 2

    # keep only last line
    last_line = whitespace.splitlines()[-1]
    return len(last_line.replace('\t', ' ' * 4))
def _get_indent(self, template):
    """Work out how many spaces the original file uses for indentation.

    Args:
        template: The saved template

    Returns:
        The number of spaces; tabs count as 4 spaces. Two is assumed when
        no indentation can be detected.
    """
    # Match all whitespace characters after the first `:` that ends a line
    # (end of first key). Stops on first non whitespace character.
    indent_pattern = ensure_unicode(r':\r?\n(?P<indent>[ \t\n]+)')
    found = re.search(indent_pattern, template)
    if found:
        whitespace = found.group('indent')
    else:
        whitespace = ' ' * 2

    # keep only last line
    last_line = whitespace.splitlines()[-1]
    expanded = last_line.replace('\t', ' ' * 4)
    return len(expanded)
def find_closing(self, start):
    """Locate the end of the tag that opens at ``start``.

    ``start`` is assumed to be the index of a ``<`` in ``self.content``.

    Returns a ``(closing_start, closing_end)`` tuple of indices into
    ``self.content``: the span of ``-->`` for a comment, the end of the tag
    (twice) for a single tag, otherwise the span of the match that brings
    the open/close count back to zero. Nothing is returned explicitly when
    no such match exists.
    """
    tail = self.content[start:]

    if self.content[start:start + 4] == "<!--":
        # Special case for comment
        closing_start = start + tail.index("-->")
        return closing_start, closing_start + 3

    opening_match = self.opening_tag_pat.search(tail)
    if self.single_tag_pat.search(opening_match.group()):
        # Single tag, eg `<foo a="b" />`
        single_end = start + opening_match.end()
        return single_end, single_end

    tag_name = opening_match.groupdict()['name']
    tag_pat = re.compile(
        ensure_unicode(
            r'\<(?:(?:{tag_name})|(?:/{tag_name}\>))'.
            format(tag_name=re.escape(tag_name))
        )
    )
    tag_matches = tag_pat.finditer(tail)

    first_match = next(tag_matches)
    assert first_match and first_match.start() == 0 and\
        first_match.group()[1] != '/'

    open_count = 1
    for match in tag_matches:
        # Every match starts with '<', so a '/' in second position is the
        # only way to recognise a closing tag here.
        is_closing = match.group()[1] == '/'
        open_count += -1 if is_closing else 1
        if open_count == 0:
            return start + match.start(), start + match.end()
class InDesignHandler(Handler):
    """A handler class that parses and compiles .idml files that are created
    in Adobe's InDesign.

    IDML files contain multiple XML fragments that can be parsed to extract
    strings from.
    """

    name = "InDesign"
    extension = "idml"
    SPECIFIER = None
    PROCESSES_BINARY = True
    EXTRACTS_RAW = False

    # The ? at the end of the string regex, makes it non-greedy in order to
    # allow trailing spaces to be preserved
    CONTENT_REGEX = r'(<Content>\s*)(.*?)(\s*</Content>)'
    SPECIAL_CHARACTERS_REGEX = re.compile(
        ensure_unicode(r'<\?ACE \d+\?>|<Br/>;'))

    # ---- Parse Methods ----

    def __init__(self, *args, **kwargs):
        self.order = count()
        self.stringset = []
        super(InDesignHandler, self).__init__(*args, **kwargs)

    def parse(self, content, **kwargs):
        """Parse .idml file content and return the resource template and
        stringset.

        * Use UCF to unpack `content` to xml fragments
        * Parse all Story fragments to extract the translatable strings
          and replace them with a replacement hash
        * Pack the fragments back to create the template
        * Return the (template, stringset) tuple
        """
        idml = UCF(io.BytesIO(content))
        ordered_stories = self._get_ordered_stories(idml)

        # Iterate over the contents of the IDML file
        for key in ordered_stories:
            try:
                # No matter what, idml values are bytes
                story_content = idml[key].decode('utf8')
            except KeyError:
                continue
            story_content = self._find_and_replace(story_content)

            # Update the XML file to contain the template strings
            idml[key] = story_content.encode('utf-8')

        out = io.BytesIO()
        idml.save(out)
        template = out.getvalue()

        return template, self.stringset

    def _get_ordered_stories(self, idml):
        """Try to find the order the stories appear in the indesign
        document.

        * Parse designmap.xml to get the StoryList attribute.
        * Return a list with the idml keys of the stories in the order they
          appear in StoryList, followed by any stories that StoryList does
          not reference.
        """
        STORY_KEY = 'Stories/Story_{}.xml'
        BACKING_STORY = 'XML/BackingStory.xml'

        designmap = idml.get('designmap.xml')
        parser = etree.XMLParser(resolve_entities=False)
        designmap_tree = etree.fromstring(designmap, parser=parser)

        story_ids = designmap_tree.attrib.get("StoryList", "").split()
        story_keys = [STORY_KEY.format(s) for s in story_ids]

        # In case there are stories that are not referenced in
        # designmap.xml, append them at the end of the list.
        all_stories = {
            k for k in six.iterkeys(idml)
            if k.startswith('Stories') or k == BACKING_STORY
        }
        # Sort the leftovers: set iteration order is arbitrary, and
        # compilation consumes the stringset in the story order used while
        # parsing, so both runs must visit the stories in the same order.
        story_keys.extend(sorted(all_stories - set(story_keys)))
        return story_keys

    def _can_skip_content(self, string):
        """Check whether the contents of an XML element are untranslatable.

        Strings that contain only special characters or can be evaluated to
        a number are skipped.
        """
        stripped_string = re.\
            sub(ensure_unicode(self.SPECIAL_CHARACTERS_REGEX), u'', string).\
            strip()
        if not stripped_string:
            return True
        try:
            float(string.strip())
            return True
        except ValueError:
            pass
        if not self._contains_translatable_character(stripped_string):
            return True
        return False

    def _contains_translatable_character(self, string):
        """Check whether a string contains at least one character that can
        be translated.

        We assume that translatable characters are the letters, the symbols
        and the punctuation (Unicode categories starting with L, S or P).
        """
        acceptable = ("L", "P", "S")
        return any(
            unicodedata.category(letter)[0] in acceptable
            for letter in string
        )

    def _find_and_replace(self, story_xml):
        """Find all the translatable content in the given XML string,
        replace it with the string hash and return the resulting template,
        updating `self.stringset` in the process.

        args:
            story_xml (str): The xml content of a single Story of the IDML
                file
        returns:
            the input string with all translatable content replaced by the
            hash of the string.
        """
        template = re.sub(ensure_unicode(self.CONTENT_REGEX),
                          self._replace, story_xml)
        return template

    def _replace(self, match):
        """Implement the logic used by `re.sub` in `_find_and_replace` to
        replace strings with their template replacement; new strings are
        appended to `self.stringset`.
        """
        opening_tag, string, closing_tag = match.groups()

        if self._can_skip_content(string):
            return match.group()
        order = next(self.order)
        string_object = OpenString(six.text_type(order), string, order=order)
        self.stringset.append(string_object)
        return u"".join(
            (opening_tag, string_object.template_replacement, closing_tag))

    # ---- Compile Methods ----

    def compile(self, template, stringset, **kwargs):
        """Compile the binary IDML `template` with the given `stringset`
        and return the resulting IDML file as bytes.
        """
        # The content is a binary IDML file
        idml = UCF(io.BytesIO(template))

        self.stringset = list(stringset)

        # Iterate over the contents of the IDML file
        for key in self._get_ordered_stories(idml):
            try:
                # no matter what, idml values are bytes
                story_content = idml[key].decode('utf-8')
            except KeyError:
                continue
            idml[key] = self._compile_story(story_content).encode('utf-8')

        out = io.BytesIO()
        idml.save(out)
        return out.getvalue()

    def _compile_story(self, story_content):
        """Handle the compilation of a single story.

        Strings are consumed from the front of `self.stringset` for as long
        as their hash is found in the story.

        args:
            story_content: the xml content of the story
        returns:
            compiled_story: the compiled story content
        """
        transcriber = Transcriber(story_content)
        hash_regex = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))

        while self.stringset:
            current_string = self.stringset[0]
            try:
                hash_position = story_content.index(
                    current_string.template_replacement)
            except ValueError:
                # Belongs to a later story; keep it queued
                break
            self.stringset.pop(0)
            transcriber.copy_until(hash_position)
            transcriber.add(self._escape_amps(current_string.string))
            transcriber.skip(len(current_string.template_replacement))

        # Copy whatever follows the last replaced hash
        transcriber.copy_until(len(story_content))
        compiled_story = transcriber.get_destination()

        # in case there are any hashes that have not been replaced, replace
        # them with an empty string
        compiled_story = hash_regex.sub(u'', compiled_story)
        return compiled_story

    @staticmethod
    def _escape_amps(string):
        """Escape "lonely" `&` (ampersands).

        If a valid XML escape sequence is found, it is left as it is.
        Otherwise, any occurrences of `&` are replaced with `&amp;`. Eg,

            "hello world"          -> "hello world"
            "hello &world"         -> "hello &amp;world"
            "hello &amp;world"     -> "hello &amp;world"
            "hello &lt;world"      -> "hello &lt;world"
            "hello &#x0a1f;world"  -> "hello &#x0a1f;world"
            "&&#1455;&&"           -> "&amp;&#1455;&amp;&amp;"
        """
        # An ampersand is "lonely" when it does not start a named, decimal
        # or hexadecimal XML escape sequence; the negative lookahead leaves
        # valid sequences untouched.
        return re.sub(
            r'&(?!(?:lt|gt|amp|apos|quot|#\d+|#x[0-9a-fA-F]+);)',
            u'&amp;',
            string)
def __init__(self, *args, **kwargs):
    """Initialise the loader, keeping a reference to the input stream.

    The first positional argument is stored as ``self.stream``. A pattern
    matching a run of comment lines at the end of a string is compiled once
    and stored as ``self.post_block_comment_pattern``.
    """
    super(TxYamlLoader, self).__init__(*args, **kwargs)
    self.stream = args[0]
    trailing_comments = ensure_unicode(r'(?:#.*\r?\n\s*)+$')
    self.post_block_comment_pattern = re.compile(trailing_comments)
def parse(self, content, **kwargs):
    """Extract the translatable strings of a markdown document.

    The optional YAML front matter is parsed with ``YamlHandler`` and its
    strings come first in the stringset; the markdown body is lexed with
    ``TxBlockLexer``. The final template is the YAML template, the header
    separator and the markdown template joined together.

    :param content: the markdown source.
    :return: a ``(template, stringset)`` tuple, with the template restored
        to the newline type detected in ``content``.
    """
    newline_type = find_newline_type(content)
    if newline_type == 'DOS':
        content = force_newline_type(content, 'UNIX')

    # mistune expands tabs to 4 spaces and trims trailing spaces, so we
    # need to do the same in order to be able to match the substrings
    blank_line = re.compile(ensure_unicode(r'^ +$'), re.M)
    content = blank_line.sub('', content.expandtabs(4))

    yaml_template = ''
    seperator = ''
    yaml_stringset = []
    yml_header = re.match(
        ensure_unicode(r'^(---\s+)([\s\S]*?[^`]\s*)(\n---\s+)(?!-)'),
        content
    )
    if yml_header:
        yaml_header_content = ''.join(yml_header.group(1, 2))
        seperator = yml_header.group(3)
        md_content = content[len(yaml_header_content) + len(seperator):]
        yaml_template, yaml_stringset = YamlHandler().parse(
            yaml_header_content)
    else:
        md_content = content
    md_template = md_content

    block = TxBlockLexer()
    markdown = Markdown(block=block)
    # Making sure stringset is empty because of recursive inside `markdown`
    block.md_stringset = []
    # Command that populates block.stringset var
    markdown(md_content)

    stringset = []
    stringset.extend(yaml_stringset)

    curr_pos = 0
    for token in block.md_stringset:
        string = string_handler(token, md_template)
        # Ignore any string that does not appear in the template,
        # We do this to avoid parsing strings that are not properly
        # handled by the Markdown library, such as ```code``` blocks
        if not string:
            continue
        found_at = md_template.find(string, curr_pos)
        if found_at == -1:
            continue

        order = len(stringset)
        string_object = OpenString(six.text_type(order), string,
                                   order=order)
        stringset.append(string_object)

        # Keep track of the index of the last replaced hash
        placeholder = string_object.template_replacement
        md_template = (md_template[:found_at] + placeholder +
                       md_template[found_at + len(string):])
        curr_pos = md_template.find(placeholder) + len(placeholder)

    template = yaml_template + seperator + md_template
    return force_newline_type(template, newline_type), stringset
def _handle_child_pairs(self, key_tag, dict_tag):
    """Process a top-level <key> tag together with its <dict> value tag.

    To avoid splitting sentences, the text surrounding the first variable
    of the format key is moved into every plural form. For example:

        <key>NSStringLocalizedFormatKey</key>
        <string>Look! There %#@mouse@ there</string>
        <key>mouse</key>
        <dict>
            <key>NSStringFormatSpecTypeKey</key>
            <string>NSStringPluralRuleType</string>
            <key>NSStringFormatValueTypeKey</key>
            <string>d</string>
            <key>one</key>
            <string>is a mouse</string>
            <key>other</key>
            <string>are %d mice</string>
        </dict>

    is treated as if it were:

        <key>NSStringLocalizedFormatKey</key>
        <string>%#@mouse@</string>
        <key>mouse</key>
        <dict>
            <key>NSStringFormatSpecTypeKey</key>
            <string>NSStringPluralRuleType</string>
            <key>NSStringFormatValueTypeKey</key>
            <string>d</string>
            <key>one</key>
            <string>Look! There is a mouse there</string>
            <key>other</key>
            <string>Look! There are %d mice there</string>
        </dict>

    This keeps whole sentences together in Transifex and prevents parts of
    the translatable content from being omitted. The comments below refer
    to this inline-replacement as [1].

    :param key_tag: The <key> tag to be handled.
    :param dict_tag: The <dict> tag to be handled.
    :returns: A list with the openstrings that were created; empty when
        none were.
    """
    # The first key tag contains the main key
    main_key = self._handle_key(key_tag, main_key=True)
    dict_iterator = self._handle_dict(dict_tag)

    openstrings = []
    # Prefix/suffix information needed for the inline-replacement [1];
    # stays None until a format key with matching content is seen.
    text_extras = None

    for key_child in dict_iterator:
        # The second key contains the secondary key.
        secondary_key = self._handle_key(key_child)
        value_tag = self._get_key_value(dict_iterator, key_child)

        if secondary_key != self.KEY_FORMAT:
            openstring = self._handle_strings(
                value_tag, main_key, secondary_key, text_extras,
            )
            # Only keep the openstring if one was actually created
            if openstring is not None:
                openstrings.append(openstring)
            continue

        # The format key is one of the stringsdict defaults: it never
        # produces an openstring itself, it only feeds `text_extras`.
        matches = re.match(ensure_unicode(self.VALUE_CONTENT_RE),
                           value_tag.content)
        if matches is not None:
            # The prefix and the suffix are relative to the FIRST variable
            text_extras = {
                group: matches.group(group)
                for group in ("variable", "prefix", "suffix")
            }

    return openstrings
def string_handler(token, template):
    """Extra checks and manipulation of a string extracted from markdown.

    :param token: Tuple of ``(string, string_type)`` where ``string_type``
        refers to the type of markdown element this string belongs to.
        ``string_type`` can be None.
    :param template: The template of the resource.
    :returns: The manipulated string, or None when the string should not
        be part of the stringset (liquid template tag, numeric reference
        label, numeric value).
    """
    # Drop new lines around string.
    string, key = token
    string = string.strip('\n')

    # for code blocks we need to maintain the exact indentation as in
    # the source file both for matching the string and replacing it in the
    # template and for producing a valid markdown on compilation
    if key == 'block_code':
        lines = string.split('\n')
        # Format *after* converting the pattern to unicode, so that a
        # non-ASCII first line cannot trigger an implicit ASCII encode on
        # Python 2.
        indents = re.findall(
            ensure_unicode(r'\n( *){}').format(re.escape(lines[0])),
            template
        )
        # The first line may not be preceded by a newline in the template
        # (e.g. the block opens the document); treat that as "no
        # indentation found" instead of failing with an IndexError.
        spaces = indents[0] if indents else ''
        if spaces:
            # Every line gets a leading newline plus the indentation,
            # mirroring how the block appears in the template.
            string = u''.join(
                u'\n{}{}'.format(spaces, line) for line in lines
            )

    # Line is a liquid template tag, ignore.
    if string.startswith('{%') and string.endswith('%}'):
        return

    # Drop # chars from beginning of the string. Only the matched prefix
    # is removed; later occurrences of the same characters (e.g. the
    # "# " inside "# C# is nice") are part of the text and must stay.
    match_header_line = re.search(ensure_unicode(r'^#+\s'), string)
    if match_header_line:
        return string[match_header_line.end():]

    # Extract Text from `[Text](link)` or `"[Text](link)"` lines
    match_link = re.search(ensure_unicode(r'^"?\[(.+)\]\(.+\)"?$'), string)
    if match_link:
        # Get content between brackets
        return match_link.groups()[0]

    # Extract Text from `[Text]: link` or `"[Text]: link"` lines
    match_reference = re.search(ensure_unicode(r'^"?\[(.+)\]:.+"?$'), string)
    if match_reference:
        label = match_reference.groups()[0]
        try:
            int(label)
        except ValueError:
            # Get content between brackets if it's not an integer number
            return label
        return

    # exclude numeric values from stringset
    try:
        float(string)
    except ValueError:
        return string
    return
class AndroidHandler(Handler):
    """A handler class that parses and compiles String Resources for ANDROID
    applications. The String Resources file is in XML format.

    String Resources file documentation can be found here:
    http://developer.android.com/guide/topics/resources/string-resource.html
    """

    name = "ANDROID"
    extension = "xml"
    EXTRACTS_RAW = True

    # printf-style placeholder: `%`, an optional positional (`1$`) or named
    # (`(key)`) argument, flags, width, precision, an optional length
    # modifier and the conversion type. The length modifiers are a plain
    # alternation; escaping the pipes would make the group match the
    # literal text "hh|h|l|ll" instead.
    SPECIFIER = re.compile(
        ensure_unicode(
            r'%((?:(?P<ord>\d+)\$|\((?P<key>\w+)\))?(?P<fullvar>[+#\- 0]*(?:\d+)?'
            r'(?:\.\d+)?(hh|h|l|ll|j|z|t|L)?(?P<type>[diufFeEgGxXaAoscpn%])))'
        ))

    # Where to start parsing the file
    PARSE_START = "<resources"

    # Relevant tags
    STRING = "string"
    STRING_PLURAL = "plurals"
    STRING_ARRAY = "string-array"

    # Relevant children
    STRING_ITEM = "item"

    # Attributes that if the child contains it should be skipped
    SKIP_ATTRIBUTES = {'translatable': 'false'}

    # Compile plural template
    PLURAL_TEMPLATE = u'<item quantity="{rule}">{string}</item>'

    # ------------------------------------------------------------------
    # Parse Methods
    # ------------------------------------------------------------------

    @reraise_syntax_as_parse_errors
    def parse(self, content, **kwargs):
        """Parse an Android string resources file.

        :param content: The content of the source file.
        :returns: A tuple ``(template, stringset)`` where the template is
            the content with every extracted string replaced by its hash.
        """
        self.transcriber = Transcriber(content)
        self.current_comment = u""
        self.order_counter = itertools.count()

        source = self.transcriber.source
        # Skip XML info declaration
        resources_tag_position = source.index(self.PARSE_START)

        parsed = DumbXml(source, resources_tag_position)
        XMLUtils.validate_no_text_characters(self.transcriber, parsed)
        XMLUtils.validate_no_tail_characters(self.transcriber, parsed)
        children_iterator = parsed.find_children(
            self.STRING,
            self.STRING_ARRAY,
            self.STRING_PLURAL,
            DumbXml.COMMENT
        )
        stringset = []
        self.existing_hashes = {}
        for child in children_iterator:
            strings = self._handle_child(child)
            if strings is not None:
                stringset.extend(strings)
                # The comment has been consumed by the strings just created
                self.current_comment = u""

        self.transcriber.copy_until(len(source))
        template = self.transcriber.get_destination()

        return template, stringset

    def _handle_child(self, child):
        """Do basic checks on the child and assigns the appropriate method to
        handle it based on the child's tag.

        :returns: A list of OpenString objects if any were created else None.
        """
        XMLUtils.validate_no_tail_characters(self.transcriber, child)
        if not self._should_ignore(child):
            if child.tag == DumbXml.COMMENT:
                self._handle_comment(child)
            else:
                if child.tag == self.STRING:
                    return self._handle_string(child)
                elif child.tag == self.STRING_ARRAY:
                    XMLUtils.validate_no_text_characters(
                        self.transcriber, child)
                    return self._handle_string_array(child)
                elif child.tag == self.STRING_PLURAL:
                    XMLUtils.validate_no_text_characters(
                        self.transcriber, child)
                    return self._handle_string_plural(child)
        else:
            # An ignored child must not pass its comment on to the next one
            self.current_comment = u""
        return None

    def _handle_string(self, child):
        """Handles child element that has the `string` tag.

        If it contains a string it will create an OpenString object.

        :returns: A list containing the OpenString object if one was
            created else it returns None.
        """
        name, product = self._get_child_attributes(child)
        string = self._create_string(
            name,
            child.content,
            self.current_comment,
            product,
            child
        )
        if string is not None:
            # <string>My Text</string>
            #         ^
            self.transcriber.copy_until(child.text_position)
            self.transcriber.add(string.template_replacement)
            # <string>My Text</string>
            #                ^
            self.transcriber.skip(len(child.content))
            return [string]
        return None

    def _handle_string_plural(self, child):
        """Handles child element that has the `plurals` tag.

        It will find children with the `item` tag and create an OpenString
        object out of them.

        :raises: Parse error if the `quantity` attribute is missing from any
            of the child's children
        :returns: A list containing the OpenString object if one was created
            else None.
        """
        string_rules_text = {}
        item_iterator = child.find_children()
        # Iterate through the children with the item tag.
        for item_tag in item_iterator:
            if item_tag.tag != DumbXml.COMMENT:
                rule_number = self._validate_plural_item(item_tag)
                string_rules_text[rule_number] = item_tag.content

        name, product = self._get_child_attributes(child)
        string = self._create_string(
            name,
            string_rules_text,
            self.current_comment,
            product,
            child,
            # <plurals> tags always define plurals, even if the language has
            # one plural form and thus there's only one <item>
            pluralized=True,
        )
        if string is not None:
            # <plurals name="foo">   <item>Hello ...
            #                        ^
            first_plural_position = child.text_position + len(child.text or '')
            self.transcriber.copy_until(first_plural_position)
            self.transcriber.add(string.template_replacement)
            # ...</item>   </plurals>...
            #           ^
            self.transcriber.skip_until(item_tag.tail_position)
            # FYI: item_tag is the last iterated item from the loop before.
            return [string]
        return None

    def _handle_string_array(self, child):
        """Handles child element that has the `string-array` tag.

        It will find children with the `item` tag and create an OpenString
        object out of each one of them.

        :returns: A list containing the OpenString objects if any were
            created else None.
        """
        strings = []
        item_iterator = child.find_children(self.STRING_ITEM)
        name, product = self._get_child_attributes(child)
        # Iterate through the children with the item tag.
        for index, item_tag in enumerate(item_iterator):
            XMLUtils.validate_no_tail_characters(self.transcriber, item_tag)
            child_name = u"{}[{}]".format(name, index)
            string = self._create_string(
                child_name,
                item_tag.content,
                self.current_comment,
                product,
                child
            )

            if string is not None:
                # ... <item>Hello...
                #           ^
                self.transcriber.copy_until(item_tag.text_position)

                strings.append(string)
                self.transcriber.add(string.template_replacement)

                # ...ello world</item>...
                #              ^
                self.transcriber.skip(len(item_tag.content))
        if strings:
            return strings
        return None

    def _handle_comment(self, child):
        """Will assign the comment found as the current comment."""
        self.current_comment = child.content

    def _create_string(self, name, text, comment, product, child,
                       pluralized=False):
        """Creates a string and returns it. If empty string it returns None.

        :param name: The name of the string.
        :param text: The string's text.
        :param comment: The developer's comment the string might have.
        :param product: Extra context for the string.
        :param child: The child tag that the string is created from.
            Used to find line numbers when errors occur.
        :param pluralized: Passed on to the created OpenString.
        :returns: Returns an OpenString object if the text is not empty
            else None.
        """
        if XMLUtils.validate_not_empty_string(
            self.transcriber,
            text,
            child,
            error_context={
                'main_tag': 'plural',
                'child_tag': 'item'
            }
        ):
            if (name, product) in self.existing_hashes:
                if child.tag in self.existing_hashes[(name, product)]:
                    format_dict = {'name': name, 'child_tag': child.tag}
                    if product:
                        msg = (u"Duplicate `tag_name` ({child_tag}) for `name`"
                               u" ({name}) and `product` ({product}) "
                               u"found on line {line_number}")
                        format_dict['product'] = product
                    else:
                        msg = (u"Duplicate `tag_name` ({child_tag}) for `name`"
                               u" ({name}) specify a product to differentiate")
                    XMLUtils.raise_error(
                        self.transcriber,
                        child,
                        msg,
                        context=format_dict
                    )
                else:
                    # Same name and product but a different tag: append the
                    # tag to the product so the context differs
                    product += child.tag
            # Create OpenString
            string = OpenString(
                name,
                text,
                context=product,
                order=next(self.order_counter),
                developer_comment=comment,
                pluralized=pluralized,
            )
            self.existing_hashes.setdefault((name, product), [])
            self.existing_hashes[(name, product)].append(child.tag)
            return string
        return None

    def _validate_plural_item(self, item_tag):
        """Performs a number of checks on the plural item to see its
        validity.

        :param item_tag: The item to perform the checks on.
        :raises: ParseError if the item tag does not meet the requirements.
        :returns: The plural number of the validated item tag.
        """
        if item_tag.tag != self.STRING_ITEM:
            msg = (u"Wrong tag type found on line {line_number}. Was "
                   u"expecting <item> but found <{wrong_tag}>")
            XMLUtils.raise_error(
                self.transcriber,
                item_tag,
                msg,
                context={'wrong_tag': item_tag.tag}
            )

        XMLUtils.validate_no_tail_characters(self.transcriber, item_tag)

        rule = item_tag.attrib.get('quantity')
        if rule is None:
            # If quantity is missing, the plural is unknown
            msg = u"Missing the `quantity` attribute on line {line_number}"
            XMLUtils.raise_error(self.transcriber, item_tag, msg)
        try:
            rule_number = self.get_rule_number(rule)
        except RuleError:
            msg = (u"The `quantity` attribute on line {line_number} contains "
                   u"an invalid plural: `{rule}`")
            XMLUtils.raise_error(
                self.transcriber,
                item_tag,
                msg,
                context={'rule': rule}
            )
        return rule_number

    def _get_child_attributes(self, child):
        """Retrieves child's `name` and `product` attributes.

        Backslashes and opening square brackets in the name are prefixed
        with a backslash.

        :param child: The child to retrieve the attributes from.
        :returns: Returns a tuple (`name`, `product`)
        :raises: It raises a ParseError if no `name` attribute is present.
        """
        name = child.attrib.get('name')
        if name is None:
            msg = u'Missing the `name` attribute on line {line_number}'
            XMLUtils.raise_error(self.transcriber, child, msg)
        name = name.\
            replace(DumbXml.BACKSLASH,
                    u''.join([DumbXml.BACKSLASH, DumbXml.BACKSLASH])).\
            replace(u'[', u''.join([DumbXml.BACKSLASH, u'[']))
        product = child.attrib.get('product', '')
        return name, product

    # ------------------------------------------------------------------
    # Compile Methods
    # ------------------------------------------------------------------

    def compile(self, template, stringset, is_source=True, language_info=None,
                **kwargs):
        """Compile a template back into an Android string resources file.

        :param template: The template produced by `parse`.
        :param stringset: Iterable of the strings to put in the template.
        :param is_source: Whether the source language is being compiled;
            children carrying a skip attribute are copied when it is and
            dropped when it is not.
        :param language_info: Optional mapping; when given and the root tag
            has a `tools:locale` attribute, the attribute's value is
            replaced with ``language_info['code']``.
        :returns: The compiled file content.
        """
        resources_tag_position = template.index(self.PARSE_START)
        self.transcriber = Transcriber(template[resources_tag_position:])
        source = self.transcriber.source

        parsed = DumbXml(source)

        # Check against 'tools:locale' attribute
        if language_info is not None and 'tools:locale' in parsed.attrib:
            value_position, value = next(
                (value_position, value)
                for _, key, value_position, value in parsed.attributes
                if key == 'tools:locale'
            )
            self.transcriber.copy_until(value_position)
            self.transcriber.add(language_info['code'])
            self.transcriber.skip(len(value))

        # This is needed in case the first tag is skipped to retain
        # the file's formating
        first_tag_position = parsed.text_position + len(parsed.text)
        self.transcriber.copy_until(first_tag_position)

        children_iterator = parsed.find_children(
            self.STRING,
            self.STRING_ARRAY,
            self.STRING_PLURAL
        )

        self.is_source = is_source
        self.stringset = iter(stringset)
        self.next_string = self._get_next_string()
        for child in children_iterator:
            self._compile_child(child)

        self.transcriber.copy_until(len(source))
        compiled = (template[:resources_tag_position] +
                    self.transcriber.get_destination())

        return compiled

    def _compile_child(self, child):
        """Do basic checks on the child and assigns the appropriate method to
        handle it based on the child's tag.
        """
        if not self._should_ignore(child):
            if child.tag == self.STRING:
                self._compile_string(child)
            elif child.tag == self.STRING_ARRAY:
                self._compile_string_array(child)
            elif child.tag == self.STRING_PLURAL:
                self._compile_string_plural(child)
        else:
            # Children carrying a skip attribute are kept verbatim for the
            # source language and removed otherwise
            if self.is_source:
                self.transcriber.copy_until(child.end)
            else:
                self._skip_tag(child)

    def _compile_string(self, child):
        """Handles child element that has the `string` and `item` tag.

        It will compile the tag if matching string exists. Otherwise it will
        skip it.
        """
        if self._should_compile(child):
            self.transcriber.copy_until(child.text_position)
            self.transcriber.add(self.next_string.string)
            self.transcriber.skip_until(child.content_end)
            self.transcriber.copy_until(child.tail_position)
            # The tail is marked as a section so that the caller can remove
            # it afterwards (see `_compile_string_array`)
            self.transcriber.mark_section_start()
            self.transcriber.copy_until(child.end)
            self.transcriber.mark_section_end()
            self.next_string = self._get_next_string()
        elif not child.text:
            # In the case of a string-array we don't want to skip an
            # empty array element that was initially empty.
            pass
        else:
            self._skip_tag(child)

    def _compile_string_array(self, child):
        """Handles child element that has the `string-array` tag.

        It will find children with the `item` tag that should be compiled
        and will compile them. If no matching string is found for a child it
        will remove it. If the `string-array` tag will be empty after
        compilation it will remove it as well.

        :NOTE: If the `string-array` was empty to begin with it will leave it
            as it is.
        """
        item_tags = list(child.find_children(self.STRING_ITEM))

        # If placeholder (has no children) skip
        if not item_tags:
            self.transcriber.copy_until(child.end)
            return

        # Check if any string matches array items
        has_match = any(
            self._should_compile(item_tag) for item_tag in item_tags
        )

        if has_match:
            # Make sure you include the <string-array> tag
            self.transcriber.copy_until(item_tags[0].start)
            # Compile found item nodes. Remove the rest.
            for item_tag in item_tags:
                self._compile_string(item_tag)
            # Swap the last marked section for the tail of the array's
            # actual last item
            self.transcriber.remove_section()
            self.transcriber.add(item_tags[-1].tail)
            self.transcriber.copy_until(child.end)
        else:
            # Remove the `string-array` tag
            self._skip_tag(child)

    def _compile_string_plural(self, child):
        """Handles child element that has the `plurals` tag.

        It will check if pluralized string exists and add every plural as an
        `item` child. If no matching string is found it will remove the tag.

        :NOTE: If the `plurals` had empty `item` tags to begin with we leave
            it as it is.
        """
        # If placeholder (has empty children) skip
        if list(child.find_children(self.STRING_ITEM)):
            return

        if self._should_compile(child):
            self.transcriber.copy_until(child.text_position)

            # Whatever surrounds the hash inside <plurals> is reused around
            # every generated <item>
            splited_content = child.content.split(
                self.next_string.template_replacement)
            start = splited_content[0]
            end = splited_content[1]

            # If newline formating
            if start.startswith(end):
                start = start.replace(end, '', 1)
                self.transcriber.add(end)

            for rule, string in six.iteritems(self.next_string.string):
                self.transcriber.add(
                    start +
                    self.PLURAL_TEMPLATE.format(
                        rule=self.get_rule_string(rule), string=string
                    ) +
                    end
                )
            self.transcriber.skip_until(child.content_end)
            self.transcriber.copy_until(child.end)
            self.next_string = self._get_next_string()
        else:
            self._skip_tag(child)

    def _should_compile(self, child):
        """Checks if the current child should be compiled.

        :param child: The child to check if it should be compiled.
        :returns: True if the child should be compiled else False.
        """
        child_content = (child.content or '').strip()
        return (self.next_string is not None and
                self.next_string.template_replacement == child_content)

    def _skip_tag(self, tag):
        """Skips a tag from the compilation.

        :param tag: The tag to be skipped.
        """
        self.transcriber.skip_until(tag.end)

    def _get_next_string(self):
        """Gets the next string from the stringset iterable.

        :returns: An openstring object or None if it has reached the end of
            the iterable.
        """
        try:
            next_string = next(self.stringset)
        except StopIteration:
            next_string = None
        return next_string

    # ------------------------------------------------------------------
    # Util Methods
    # ------------------------------------------------------------------

    @staticmethod
    def _should_ignore(child):
        """Checks if the child contains any key:value pair from the
        SKIP_ATTRIBUTES dict.

        :returns: True if it contains any else false.
        """
        for key, value in six.iteritems(AndroidHandler.SKIP_ATTRIBUTES):
            filter_attr = child.attrib.get(key)
            if filter_attr is not None and filter_attr == value:
                return True
        return False

    # Escaping / Unescaping
    # According to:
    # https://developer.android.com/guide/topics/resources/string-resource#FormattingAndStyling
    # https://developer.android.com/guide/topics/resources/string-resource#StylingWithHTML

    INLINE_TAGS = ("xliff:g", "a", "annotation", "b", "em", "i", "cite", "dfn",
                   "big", "small", "font", "tt", "s", "strike", "del", "u",
                   "sup", "sub", "ul", "li", "br", "div", "span", "p")

    @staticmethod
    def escape(string):
        """Escape text for use in Android files.

        Respect tags that are allowed in strings. Examples:
          "hello" world => \\"hello\\" world
          <a b="c">hello</a> => <a b="c">hello</a>
          <x y="z">hello</x> => <x y=\\"z\\">hello</x>

        :param str string: string to be escaped
        :return: escaped string
        :rtype: unicode
        """
        def _escape_text(string):
            # If the string starts with an at-sign that doesn't identify
            # another string, then we need to escape it using a leading
            # backslash
            if string.startswith(u'@') and not string.startswith(u'@string/'):
                string = string.replace(u'@', u'\\@', 1)

            return string.\
                replace(DumbXml.DOUBLE_QUOTES,
                        u''.join([DumbXml.BACKSLASH, DumbXml.DOUBLE_QUOTES])).\
                replace(DumbXml.SINGLE_QUOTE,
                        u''.join([DumbXml.BACKSLASH, DumbXml.SINGLE_QUOTE]))

        return xml_escape(string, AndroidHandler.INLINE_TAGS, _escape_text)

    @staticmethod
    def unescape(string):
        """Undo the Android escaping of a string.

        A leading escaped at-sign loses its backslash. A string wrapped in
        double quotes loses the wrapping quotes and has its escaped double
        quotes unescaped; any other string has both escaped single and
        escaped double quotes unescaped.

        :param string: The escaped string.
        :returns: The unescaped string.
        """
        # If the string starts with an escaped at-sign, do not display the
        # backslash
        if string.startswith(u'\\@'):
            string = string[1:]

        if len(string) and string[0] == string[-1] == DumbXml.DOUBLE_QUOTES:
            return string[1:-1].\
                replace(u''.join([DumbXml.BACKSLASH, DumbXml.DOUBLE_QUOTES]),
                        DumbXml.DOUBLE_QUOTES)
        else:
            return string.\
                replace(u''.join([DumbXml.BACKSLASH, DumbXml.SINGLE_QUOTE]),
                        DumbXml.SINGLE_QUOTE).\
                replace(u''.join([DumbXml.BACKSLASH, DumbXml.DOUBLE_QUOTES]),
                        DumbXml.DOUBLE_QUOTES)
def parse(self, content, **kwargs):
    """Extract the translatable strings of a Markdown document.

    The content is normalised (UNIX newlines, tabs expanded to 4 spaces,
    space-only lines emptied), an optional leading YAML header is split off
    and parsed with ``YamlHandler`` (each resulting string is then passed
    to ``self._unescape_non_printable``), and the remaining Markdown is fed
    to ``Markdown`` with a ``TxBlockLexer`` that collects candidate
    strings. Each accepted string is replaced in the template by its hash.

    :param content: The source document as text.
    :returns: A tuple ``(template, stringset)``; the template is converted
        back to the newline type of the original content.
    """
    newline_type = find_newline_type(content)
    if newline_type == 'DOS':
        content = force_newline_type(content, 'UNIX')

    # mistune expands tabs to 4 spaces and trims trailing spaces, so we
    # need to do the same in order to be able to match the substrings
    space_only_line = re.compile(ensure_unicode(r'^ +$'), re.M)
    content = space_only_line.sub('', content.expandtabs(4))

    # Defaults for documents that have no YAML header
    yaml_template = ''
    yaml_stringset = []
    separator = ''
    md_content = content

    yml_header = re.match(
        ensure_unicode(r'^(---\s+)([\s\S]*?[^`]\s*)(\n---\s+)(?!-)'),
        content
    )
    if yml_header:
        yaml_header_content = ''.join(yml_header.group(1, 2))
        separator = yml_header.group(3)
        md_content = content[len(yaml_header_content + separator):]
        yaml_template, yaml_stringset = YamlHandler().parse(
            yaml_header_content
        )
        for openstring in yaml_stringset:
            self._unescape_non_printable(openstring)
    md_template = md_content

    block = TxBlockLexer()
    markdown = Markdown(block=block)
    # Making sure stringset is empty because of recursive inside `markdown`
    block.md_stringset = []
    # Command that populates block.stringset var
    markdown(md_content)

    # YAML strings come first; markdown strings continue their numbering
    stringset = list(yaml_stringset)
    order = len(stringset)
    curr_pos = 0
    for token in block.md_stringset:
        string = string_handler(token, md_template)
        if not string:
            continue

        # Ignore any string that does not appear in the template past the
        # last replaced hash. We do this to avoid parsing strings that are
        # not properly handled by the Markdown library, such as
        # ```code``` blocks
        start = md_template.find(string, curr_pos)
        if start == -1:
            continue

        string_object = OpenString(six.text_type(order), string,
                                   order=order)
        order += 1
        stringset.append(string_object)

        # Swap the first occurrence after `curr_pos` for the hash
        replacement = string_object.template_replacement
        md_template = (
            md_template[:start] + replacement +
            md_template[start + len(string):]
        )
        # Keep track of the index of the last replaced hash
        curr_pos = md_template.find(replacement) + len(replacement)

    template = yaml_template + separator + md_template
    return force_newline_type(template, newline_type), stringset