def string_handler(token, template):
    """
    Extra checks and manipulation of a string extracted from a markdown file.

    Parameters:
    token: Tuple of (string, string_type) where string_type refers to the
           type of markdown element this string belongs to. string_type
           can be None.
    template: the template of the resource

    returns: the manipulated string, or None when the string should not be
             part of the stringset (liquid template tags, numeric reference
             labels, plain numbers)
    """
    string, key = token
    # Drop new lines around string.
    string = string.strip('\n')

    # for code blocks we need to maintain the exact indentation as in
    # the source file both for matching the string and replacing it in the
    # template and for producing a valid markdown on compilation
    if key == 'block_code':
        lines = string.split('\n')
        indents = re.findall(
            ensure_unicode(r'\n( *){}').format(re.escape(lines[0])), template)
        # `findall` returns an empty list when the first line of the block
        # is not preceded by a newline in the template (e.g. it sits on the
        # very first line); treat that as "no indentation" instead of
        # failing with an IndexError.
        spaces = indents[0] if indents else ''
        if spaces:
            string = u''.join(
                u'\n{}{}'.format(spaces, line) for line in lines)

    # Line is a liquid template tag, ignore.
    if string.startswith('{%') and string.endswith('%}'):
        return None

    # Drop # chars from beginning of the string
    match_header_line = re.search(ensure_unicode(r'^#+\s'), string)
    if match_header_line:
        # Only cut the leading marker. Removing every occurrence of e.g.
        # "# " would also alter the rest of the heading text.
        return string[match_header_line.end():]

    # Extract Text from `[Text]: link` or `"[Text]: link"` lines
    match_reference = re.search(ensure_unicode(r'^"?\[([^\[^\]]+)\]:.+"?$'),
                                string)
    if match_reference:
        label = match_reference.groups()[0]
        try:
            int(label)
        except ValueError:
            # Get content between brackets if it's not an integer number
            return label
        return None

    # exclude numeric values from stringset
    try:
        float(string)
    except ValueError:
        return string
    return None
    def parse(self, content, **kwargs):
        """Extract the translatable strings of a markdown document.

        The content is normalised (DOS newlines converted to UNIX, tabs
        expanded to 4 spaces, space-only lines emptied), an optional leading
        `---` header is handed to `self.yaml_parser`, and the rest is run
        through `Markdown` with a `TxBlockLexer`. Every collected string
        that survives `string_handler` and still occurs in the template
        after the previous replacement is swapped for its
        `template_replacement`.

        returns: a (template, stringset) tuple; the template is converted
                 back to the newline type detected in the input
        """
        newline_type = find_newline_type(content)
        if newline_type == 'DOS':
            content = force_newline_type(content, 'UNIX')

        # mistune expands tabs to 4 spaces and trims trailing spaces, so we
        # need to do the same in order to be able to match the substrings
        only_spaces = re.compile(ensure_unicode(r'^ +$'), re.M)
        content = only_spaces.sub('', content.expandtabs(4))
        template = content

        header_match = re.match(
            ensure_unicode(r'^(---\s+)([\s\S]*?[^`])\s*(\n---\s+)(?!-)'),
            content
        )
        if header_match:
            header_text = header_match.group()
            md_content = content[len(header_text):]
            yaml_tokens = self.yaml_parser(header_text)
        else:
            md_content = content
            yaml_tokens = []

        lexer = TxBlockLexer()
        render = Markdown(block=lexer)
        # Start from an empty list before rendering (the original comment
        # attributes this to recursion inside `markdown`).
        lexer.md_stringset = []
        # Rendering is what fills `lexer.md_stringset`.
        render(md_content)

        stringset = []
        cursor = 0
        for token in yaml_tokens + lexer.md_stringset:
            text = string_handler(token, template)
            if not text or text not in template[cursor:]:
                continue
            # One entry is appended per accepted string, so the running
            # order equals the current size of the stringset.
            order = len(stringset)
            entry = OpenString(six.text_type(order), text, order=order)
            stringset.append(entry)
            placeholder = entry.template_replacement
            # Replace only the first occurrence after the previous hash.
            template = (template[:cursor] +
                        template[cursor:].replace(text, placeholder, 1))
            # Keep track of the end of the last replaced hash
            cursor = template.find(placeholder) + len(placeholder)
        return force_newline_type(template, newline_type), stringset
    def parse(self, content, **kwargs):
        """Build the template and stringset for a markdown document.

        Steps: normalise newlines and whitespace, split off an optional
        leading `---` header (parsed by `self.yaml_parser`), render the
        remaining markdown with a `TxBlockLexer` to collect strings, then
        replace each usable string in the template with the
        `template_replacement` of a new `OpenString`.

        returns: (template, stringset), with the template restored to the
                 newline type found in the input
        """
        newline_type = find_newline_type(content)
        if newline_type == 'DOS':
            content = force_newline_type(content, 'UNIX')

        # mistune expands tabs to 4 spaces and trims trailing spaces, so we
        # need to do the same in order to be able to match the substrings
        blank_line_pattern = re.compile(ensure_unicode(r'^ +$'), re.M)
        content = blank_line_pattern.sub('', content.expandtabs(4))
        template = content

        yaml_stringset = []
        md_content = content
        yml_header = re.match(
            ensure_unicode(r'^(---\s+)([\s\S]*?[^`])\s*(\n---\s+)(?!-)'),
            content)
        if yml_header:
            yaml_header_content = yml_header.group()
            md_content = content[len(yaml_header_content):]
            yaml_stringset = self.yaml_parser(yaml_header_content)

        block_lexer = TxBlockLexer()
        markdown = Markdown(block=block_lexer)
        # Reset the collected strings before rendering.
        block_lexer.md_stringset = []
        # Rendering populates `block_lexer.md_stringset`.
        markdown(md_content)

        stringset = []
        search_from = 0
        for candidate in yaml_stringset + block_lexer.md_stringset:
            handled = string_handler(candidate, template)
            if not handled:
                continue
            if handled not in template[search_from:]:
                continue
            # `order` grows by one for each appended entry.
            order = len(stringset)
            string_object = OpenString(six.text_type(order), handled,
                                       order=order)
            stringset.append(string_object)
            replacement = string_object.template_replacement
            untouched = template[:search_from]
            remainder = template[search_from:].replace(handled,
                                                       replacement, 1)
            template = untouched + remainder
            # Continue searching right after the hash just inserted.
            search_from = template.find(replacement) + len(replacement)
        return force_newline_type(template, newline_type), stringset
Exemple #4
0
    def _compile_story(self, story_content):
        """ Compile a single story by substituting hashes with strings.

        Entries are taken from the front of `self.stringset` for as long as
        their hash is present in `story_content`; the first entry whose hash
        is missing is put back and processing stops. Each inserted string is
        passed through `self._escape_amps` first.

        args:
            story_content: the xml content of the story
        returns:
            compiled_story: the compiled story content
        """
        transcriber = Transcriber(story_content)
        leftover_hash = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))
        while True:
            try:
                entry = self.stringset.pop(0)
            except IndexError:
                # No strings left to place.
                break
            try:
                position = story_content.index(entry.template_replacement)
            except ValueError:
                # The hash is not in this story: return the entry to the
                # front of the list and stop.
                self.stringset.insert(0, entry)
                break
            transcriber.copy_until(position)
            transcriber.add(self._escape_amps(entry.string))
            transcriber.skip(len(entry.template_replacement))

        # Copy whatever follows the last replaced hash
        transcriber.copy_until(len(story_content))
        # in case there are any hashes that have not been replaced, replace
        # them with an empty string
        return leftover_hash.sub(u'', transcriber.get_destination())
 def yaml_parser(self, yaml_header):
     """Collect candidate strings from the lines of a yaml header.

     Every `key: value` line contributes `(value, None)`. When the value is
     a non-empty substring of '|>[', the following lines that are indented
     at least 2 spaces more than the key are gathered into one string
     (each line followed by a newline) and contributed as a single entry.
     Lines starting with '#' are ignored.

     returns: a list of (string, None) tuples
     """
     # TODO: This is a temporary solution. Yaml header should be parsed
     # with an actual yaml parser when it is implemented in openformats
     collected = []
     in_block = False
     block_parts = []
     parent_indent = 0
     for raw_line in yaml_header.splitlines():
         # ignore comments
         if raw_line.startswith('#'):
             continue
         if in_block:
             # at least 2 spaces more indented than the parent line
             if raw_line.startswith(' ' * (parent_indent + 2)):
                 block_parts.append(raw_line + '\n')
                 continue
             # First line outside the block: emit the block and fall
             # through so this line is handled as a regular line.
             collected.append((''.join(block_parts), None))
             block_parts = []
             in_block = False
         key, separator, remainder = raw_line.partition(':')
         if not separator:
             continue
         value = remainder.strip()
         # we parse all the lines that follow the '|' or '>' symbols
         # and are at least 2 spaces more indented than the parent line
         # as one string
         if value and value in '|>[':
             # number of leading spaces of the key
             parent_indent = len(key) - len(key.lstrip(' '))
             in_block = True
             continue
         collected.append((value, None))
     return collected
Exemple #6
0
    def find_closing(self, start):
        """Locate the closing tag of the element that opens at `start`.

        Assumes `self.content[start]` is a '<'.

        returns: a (closing_start, closing_end) tuple of offsets into
                 `self.content`. For a comment these delimit the `-->`;
                 for a single tag both equal the end of the opening tag.
                 Nothing is returned explicitly (i.e. None) when the
                 nesting count never gets back to zero.
        """
        if self.content[start:start + 4] == "<!--":
            # Special case for comment
            comment_close = start + self.content[start:].index("-->")
            return comment_close, comment_close + 3

        remainder = self.content[start:]
        opening = self.opening_tag_pat.search(remainder)

        if self.single_tag_pat.search(opening.group()):
            # Single tag, eg `<foo a="b" />`
            tag_end = start + opening.end()
            return tag_end, tag_end

        escaped_name = re.escape(opening.groupdict()['name'])
        # Matches either `<name` or `</name>`.
        # NOTE(review): `<name` also matches tags that merely start with
        # the same name - confirm whether that can occur in practice.
        tag_pat = re.compile(
            ensure_unicode(r'\<(?:(?:{tag_name})|(?:/{tag_name}\>))'.format(
                tag_name=escaped_name)))
        matches = tag_pat.finditer(remainder)
        first = next(matches)
        assert first and first.start() == 0 and first.group()[1] != '/'

        depth = 1
        for candidate in matches:
            text = candidate.group()
            is_closing = text[1] == '/' or text == "-->"
            depth += -1 if is_closing else 1
            if depth == 0:
                return start + candidate.start(), start + candidate.end()
 def yaml_parser(self, yaml_header):
     """Extract `(string, None)` tuples from a yaml header, line by line.

     A `key: value` line yields its stripped value. If that value is a
     non-empty substring of '|>[', the lines below it that are indented at
     least 2 spaces deeper than the key are concatenated (newline after
     each) and yielded as one string instead. Lines starting with '#' are
     skipped.

     returns: a list of (string, None) tuples
     """
     # TODO: This is a temporary solution. Yaml header should be parsed
     # with an actual yaml parser when it is implemented in openformats
     yaml_strings = []
     pending_block = None  # list of lines while inside a block
     key_indent = 0
     for line in yaml_header.splitlines():
         # ignore comments
         if line.startswith('#'):
             continue
         if pending_block is not None:
             # at least 2 spaces more indented than the parent line
             if line.startswith(' ' * (key_indent + 2)):
                 pending_block.append(line + '\n')
                 continue
             # The block ended; store it and process this line normally.
             yaml_strings.append((''.join(pending_block), None))
             pending_block = None
         head, colon, tail = line.partition(':')
         if colon:
             value = tail.strip()
             # we parse all the lines that follow the '|' or '>' symbols
             # and are at least 2 spaces more indented than the parent
             # line as one string
             if value and value in '|>[':
                 # count the spaces in front of the key
                 key_indent = len(head) - len(head.lstrip(' '))
                 pending_block = []
             else:
                 yaml_strings.append((value, None))
     return yaml_strings
Exemple #8
0
    def _compile_story(self, story_content):
        """ Compile a single story by substituting hashes with strings.

        Entries are popped from the front of `self.stringset` while their
        hash can be found in `story_content`. The first entry whose hash is
        absent is re-inserted at the front and the loop ends. Strings are
        inserted as they are.

        args:
            story_content: the xml content of the story
        returns:
            compiled_story: the compiled story content
        """
        transcriber = Transcriber(story_content)
        unreplaced_hash = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))
        while True:
            try:
                candidate = self.stringset.pop(0)
            except IndexError:
                # The stringset is exhausted.
                break
            try:
                hash_position = story_content.index(
                    candidate.template_replacement
                )
            except ValueError:
                # Not part of this story; keep it for a later one.
                self.stringset.insert(0, candidate)
                break
            transcriber.copy_until(hash_position)
            transcriber.add(candidate.string)
            transcriber.skip(len(candidate.template_replacement))

        # Copy the rest of the story after the last replaced hash
        transcriber.copy_until(len(story_content))
        # in case there are any hashes that have not been replaced, replace
        # them with an empty string
        return unreplaced_hash.sub(u'', transcriber.get_destination())
Exemple #9
0
    def find(self, tags=None):
        """Yield the elements found in `self.content`, with their offsets.

        Parameters:
        tags: a tag name or a list of tag names to look for; when empty or
              omitted every '<' is considered

        yields: (found, offset) tuples, where `found` is a `DumbXml` built
                from the matched element and `offset` is the position of
                its '<' in `self.content`. A match at offset 0 and matches
                for which `self._is_within_comment` is true are skipped.
        """
        # `None` replaces the former mutable `[]` default; both are falsy,
        # so callers see no difference.
        if isinstance(tags, (six.binary_type, six.text_type)):
            tags = [tags]

        if not tags:
            pat = re.compile(ensure_unicode(r'\<'), re.DOTALL)
        else:
            alternatives = '|'.join(re.escape(tag) for tag in tags)
            pat = re.compile(
                ensure_unicode(r'\<(?:{})'.format(alternatives)), re.DOTALL)

        for match in pat.finditer(self.content):
            if match.start() == 0 or self._is_within_comment(match):
                continue
            offset = match.start()
            _, closing_end = self.find_closing(offset)
            found = DumbXml(self.content[offset:closing_end])
            # The pattern only checks the start of the name, so the parsed
            # name is compared against the requested tags as well.
            if not tags or found.name in tags:
                yield found, offset
Exemple #10
0
    def __init__(self, content):
        """
        Parse `content` and set the following attributes on `self`:

        * content: The content of the tag, including the opening/closing
            tags
        * name: The name of the tag (`self.COMMENT` for a comment)
        * attrs: A dictionary of all the attributes of the tag with their
            values (empty for a comment)
        * inner_offset: the place of the character where the inner content
            of the tag starts, aka the length of the opening tag
        * inner: the inner content of the tag
        """
        # Fix regex encoding
        self.opening_tag_pat = re.compile(
            ensure_unicode(self.OPENING_TAG_PAT), re.DOTALL
        )
        self.attr_pat = re.compile(ensure_unicode(self.ATTR_PAT))
        self.single_tag_pat = re.compile(ensure_unicode(self.SINGLE_TAG_PAT))

        self.content = content

        if content[:4] == "<!--":
            # Special case for comment: everything up to `-->` is inner
            self.inner_offset = 4
            self.name = self.COMMENT
            self.attrs = {}
            self.inner = content[4:content.index("-->")]
            return

        opening = self.opening_tag_pat.search(content)
        opening_groups = opening.groupdict()
        self.inner_offset = opening.end()
        self.name = opening_groups['name']
        self.attrs = {}
        for attr_match in self.attr_pat.finditer(opening_groups['attrs']):
            attr = attr_match.groupdict()
            self.attrs[attr['key']] = attr['value']

        # Only the start of the closing tag is needed here.
        closing_start, _ = self.find_closing(0)
        self.inner = content[opening.end():closing_start]
Exemple #11
0
    def __init__(self, content):
        """
        Parse `content` and populate this instance with:

        * content: The content of the tag, including the opening/closing
            tags
        * name: The name of the tag (`self.COMMENT` for a comment)
        * attrs: A dictionary of all the attributes of the tag with their
            values (empty for a comment)
        * inner_offset: the place of the character where the inner content
            of the tag starts, aka the length of the opening tag
        * inner: the inner content of the tag
        """
        # Fix regex encoding
        opening_source = ensure_unicode(self.OPENING_TAG_PAT)
        self.opening_tag_pat = re.compile(opening_source, re.DOTALL)
        self.attr_pat = re.compile(ensure_unicode(self.ATTR_PAT))
        self.single_tag_pat = re.compile(ensure_unicode(self.SINGLE_TAG_PAT))

        self.content = content

        is_comment = content[:4] == "<!--"
        if is_comment:
            # Special case for comment
            self.inner_offset = 4
            self.name = self.COMMENT
            self.attrs = {}
            self.inner = content[4:content.index("-->")]
            return

        opening_match = self.opening_tag_pat.search(content)
        inner_start = opening_match.end()
        self.inner_offset = inner_start
        self.name = opening_match.groupdict()['name']
        self.attrs = {}
        raw_attrs = opening_match.groupdict()['attrs']
        for found in self.attr_pat.finditer(raw_attrs):
            pair = found.groupdict()
            self.attrs[pair['key']] = pair['value']

        # The inner content ends where the closing tag starts.
        inner_end, _ = self.find_closing(0)
        self.inner = content[inner_start:inner_end]
Exemple #12
0
    def compile(self, template, stringset, **kwargs):
        """Replace every hash in `template` with its string.

        Parameters:
        template: the template containing the `template_replacement` hashes
        stringset: the strings to insert; assumed to be ordered as their
                   hashes appear in the template

        Non-pluralized strings replace their hash directly. For pluralized
        strings, one `self.plural_template` entry is written per plural
        rule; when the hash sits on a line of its own (indent and tail both
        match `self.SPACE_PAT`), each entry is written on a separate line
        with that indent and tail.

        returns: the compiled content
        raises: ValueError if a hash is missing from the template, or if a
                pluralized hash has no newline before or after it
        """
        # Fix regex encoding
        space_pattern = re.compile(ensure_unicode(self.SPACE_PAT))

        # assume stringset is ordered within the template
        transcriber = Transcriber(template)
        template = transcriber.source

        for string in stringset:
            replacement = string.template_replacement
            hash_position = template.index(replacement)
            if not string.pluralized:
                transcriber.copy_until(hash_position)
                transcriber.add(string.string)
                transcriber.skip(len(replacement))
                continue

            # if the hash is on its own on a line with only spaces, we have
            # to remember its indent. The line boundaries are located with
            # bounded searches so the template is not copied or reversed
            # for every pluralized string.
            hash_end = hash_position + len(replacement)
            line_start = template.rindex('\n', 0, hash_position) + 1
            line_end = template.index('\n', hash_end)
            indent = template[line_start:hash_position]
            tail = template[hash_end:line_end]

            if space_pattern.search(indent) and space_pattern.search(tail):
                transcriber.copy_until(line_start)
                for rule, value in six.iteritems(string.string):
                    transcriber.add(
                        indent + self.plural_template.format(
                            rule=self.RULES_ITOA[rule], string=value
                        ) + tail + '\n'
                    )
                # Skip the whole original line, including its newline.
                transcriber.skip(line_end + 1 - line_start)
            else:
                # string is not on its own, simply replace hash with all
                # plural forms
                transcriber.copy_until(hash_position)
                for rule, value in six.iteritems(string.string):
                    transcriber.add(self.plural_template.format(
                        rule=self.RULES_ITOA[rule], string=value
                    ))
                transcriber.skip(len(replacement))

        transcriber.copy_until(len(template))
        return transcriber.get_destination()
Exemple #13
0
    def find(self, tags=None):
        """Yield the elements found in `self.content`, with their offsets.

        Parameters:
        tags: a tag name or a list of tag names to look for; when empty or
              omitted every '<' is considered

        yields: (found, offset) tuples, where `found` is a `DumbXml` built
                from the matched element and `offset` is the position of
                its '<' in `self.content`. A match at offset 0 and matches
                for which `self._is_within_comment` is true are skipped.
        """
        # A falsy `None` default avoids the shared mutable `[]` default
        # without changing what callers observe.
        if isinstance(tags, (six.binary_type, six.text_type)):
            tags = [tags]

        if tags:
            escaped_tags = (re.escape(tag) for tag in tags)
            pat = re.compile(
                ensure_unicode(r'\<(?:{})'.format('|'.join(escaped_tags))),
                re.DOTALL
            )
        else:
            pat = re.compile(ensure_unicode(r'\<'), re.DOTALL)

        for match in pat.finditer(self.content):
            if match.start() == 0 or self._is_within_comment(match):
                continue
            offset = match.start()
            _, closing_end = self.find_closing(offset)
            found = DumbXml(self.content[offset:closing_end])
            # The pattern only anchors the beginning of the tag name, so
            # the parsed name is checked against `tags` too.
            if not tags or found.name in tags:
                yield found, offset
Exemple #14
0
 def _find_and_replace(self, story_xml):
     """
     Substitute every match of `self.CONTENT_REGEX` in the given XML
     string using `self._replace` and return the result.

     args:
         story_xml (str): The xml content of a single Story of the IDML file
     returns:
         the input string after the substitution; per the original
         description, translatable content ends up replaced by the hash of
         the string while `self.stringset` is updated in the process.
     """
     content_pattern = re.compile(ensure_unicode(self.CONTENT_REGEX))
     return content_pattern.sub(self._replace, story_xml)
Exemple #15
0
    def compile(self, template, stringset, **kwargs):
        """Replace every hash in `template` with its string.

        Parameters:
        template: the template containing the `template_replacement` hashes
        stringset: the strings to insert; assumed to be ordered as their
                   hashes appear in the template

        Non-pluralized strings replace their hash directly. For pluralized
        strings, one `self.plural_template` entry is written per plural
        rule; when the hash sits on a line of its own (indent and tail both
        match `self.SPACE_PAT`), each entry is written on a separate line
        with that indent and tail.

        returns: the compiled content
        raises: ValueError if a hash is missing from the template, or if a
                pluralized hash has no newline before or after it
        """
        # Fix regex encoding
        space_pattern = re.compile(ensure_unicode(self.SPACE_PAT))

        # assume stringset is ordered within the template
        transcriber = Transcriber(template)
        template = transcriber.source

        for string in stringset:
            placeholder = string.template_replacement
            hash_position = template.index(placeholder)
            if not string.pluralized:
                transcriber.copy_until(hash_position)
                transcriber.add(string.string)
                transcriber.skip(len(placeholder))
                continue

            # if the hash is on its own on a line with only spaces, we have
            # to remember its indent. Bounded `rindex`/`index` calls find
            # the surrounding newlines without slicing or reversing the
            # template for each pluralized string.
            hash_end = hash_position + len(placeholder)
            line_start = template.rindex('\n', 0, hash_position) + 1
            line_end = template.index('\n', hash_end)
            indent = template[line_start:hash_position]
            tail = template[hash_end:line_end]

            on_own_line = (space_pattern.search(indent)
                           and space_pattern.search(tail))
            if on_own_line:
                transcriber.copy_until(line_start)
                for rule, value in six.iteritems(string.string):
                    plural_line = self.plural_template.format(
                        rule=self.RULES_ITOA[rule], string=value)
                    transcriber.add(indent + plural_line + tail + '\n')
                # Skip indent, hash, tail and the trailing newline.
                transcriber.skip(line_end + 1 - line_start)
            else:
                # string is not on its own, simply replace hash with all
                # plural forms
                transcriber.copy_until(hash_position)
                for rule, value in six.iteritems(string.string):
                    transcriber.add(
                        self.plural_template.format(
                            rule=self.RULES_ITOA[rule], string=value))
                transcriber.skip(len(placeholder))

        transcriber.copy_until(len(template))
        return transcriber.get_destination()
Exemple #16
0
 def _find_and_replace(self, story_xml):
     """
     Run `self._replace` over every match of `self.CONTENT_REGEX` in the
     given XML string and return the substituted text.

     args:
         story_xml (str): The xml content of a single Story of the IDML file
     returns:
         the input string after the substitution; according to the original
         description the translatable content is replaced by the hash of
         the string and `self.stringset` is updated along the way.
     """
     pattern = ensure_unicode(self.CONTENT_REGEX)
     return re.compile(pattern).sub(self._replace, story_xml)
Exemple #17
0
    def _is_custom_tag(self, tag):
        """
        Check whether a value is tagged with a custom type.

        Intended to detect custom tags, like:
            `foo: !bar test`
            `foo: !xml "<bar>Bar</bar>"`
        The original description notes that built-in types (`!!` prefix)
        are not expected to match because PyYAML reports them with the
        built-in identifier, e.g. `tag:yaml.org,2002:str` rather than
        `!!str`.

        returns: the match object when the whole `tag` matches the pattern,
                 otherwise None
        """
        # NOTE(review): as written, the pattern accepts any run of '!',
        # '[', ASCII letters and '_' followed by optional ']' characters,
        # including the empty string - confirm this is intended.
        custom_tag_pattern = re.compile(
            ensure_unicode(r'^[\![a-zA-Z_]*]*$'), re.IGNORECASE
        )
        return custom_tag_pattern.match(tag)
Exemple #18
0
    def _is_custom_tag(self, tag):
        """
        Tell whether `tag` looks like a custom YAML type tag.

        Meant for tags such as:
            `foo: !bar test`
            `foo: !xml "<bar>Bar</bar>"`
        Per the original description, built-in types (written with a `!!`
        prefix) are reported by PyYAML with their built-in identifier, for
        example `tag:yaml.org,2002:str`, and are therefore not matched.

        returns: the match object when the whole `tag` matches the pattern,
                 otherwise None
        """
        # NOTE(review): the character class also contains '[' and the
        # pattern matches the empty string - verify against callers.
        pattern = ensure_unicode(r'^[\![a-zA-Z_]*]*$')
        return re.compile(pattern, re.IGNORECASE).match(tag)
Exemple #19
0
 def _can_skip_content(self, string):
     """
     Check whether the given XML content can be left out of translation.

     Returns True when the string is empty after removing the matches of
     `self.SPECIAL_CHARACTERS_REGEX` and stripping whitespace, when the
     stripped original can be converted with `float`, or when
     `self._contains_translatable_character` rejects what is left.
     Returns False otherwise.
     """
     without_specials = re.sub(
         ensure_unicode(self.SPECIAL_CHARACTERS_REGEX), u'', string
     )
     stripped_string = without_specials.strip()
     if not stripped_string:
         return True

     # The numeric check uses the original string, not the cleaned one.
     try:
         float(string.strip())
     except ValueError:
         is_number = False
     else:
         is_number = True
     if is_number:
         return True

     return not self._contains_translatable_character(stripped_string)
Exemple #20
0
 def _can_skip_content(self, string):
     """
     Decide whether a piece of XML content should be skipped.

     The content is skipped (True) if nothing remains once the matches of
     `self.SPECIAL_CHARACTERS_REGEX` and the surrounding whitespace are
     removed, if the stripped original can be converted with `float`, or if
     `self._contains_translatable_character` is false for the remainder.
     In every other case False is returned.
     """
     special_characters = ensure_unicode(self.SPECIAL_CHARACTERS_REGEX)
     stripped_string = re.sub(special_characters, u'', string).strip()
     if not stripped_string:
         return True

     looks_numeric = True
     try:
         # Note: the original string is tested here, not the cleaned one.
         float(string.strip())
     except ValueError:
         looks_numeric = False

     if looks_numeric:
         return True
     if self._contains_translatable_character(stripped_string):
         return False
     return True
Exemple #21
0
    def parse(self, key, value):
        """
        Parse a string that follows a subset of the ICU message format
        and return an ICUString object.

        Only the plural format is handled: when the argument type equals
        `ICUParser.PLURAL_ARG`, the work is delegated to
        `self._parse_pluralized_string`. In every other case, including a
        `value` that does not match the expected layout, None is returned.

        Note: if we want to support more ICU features in the future,
        this would probably have to be refactored.

        :param key: the string key
        :param value: the serialized string that has all the content,
            formatted like this:
            { item_count, plural,
                one { You have {file_count} file. }
                other { You have {file_count} files. }
            }
        :return: whatever `_parse_pluralized_string` returns (documented
            originally as an ICUString), or None if the string is not in
            the supported ICU format
        :rtype: ICUString
        :raise ParseError: per the original description, if the given
            string looks a lot like an ICU plural string but has an invalid
            structure
        """
        icu_pattern = re.compile(
            ensure_unicode(
                r'\s*{\s*([A-Za-z-_\d]+)\s*,\s*([A-Za-z_]+)\s*,\s*(.*)}\s*'))
        matches = icu_pattern.match(value)
        if matches is None:
            return None

        keyword, argument, serialized_strings = matches.groups()
        if argument != ICUParser.PLURAL_ARG:
            # Only the plural argument type is supported.
            return None

        return self._parse_pluralized_string(
            key, keyword, value, serialized_strings)
Exemple #22
0
    def parse(self, key, value):
        """
        Parse a string written in a subset of the ICU message format.

        For the time being only plurals are supported: if the argument type
        is `ICUParser.PLURAL_ARG` the call is forwarded to
        `self._parse_pluralized_string`; otherwise, or when `value` does
        not have the expected layout, None is returned.

        Note: if we want to support more ICU features in the future,
        this would probably have to be refactored.

        :param key: the string key
        :param value: the serialized string that has all the content,
            formatted like this:
            { item_count, plural,
                one { You have {file_count} file. }
                other { You have {file_count} files. }
            }
        :return: the result of `_parse_pluralized_string` (documented
            originally as an ICUString), or None if the string is not in
            the supported ICU format
        :rtype: ICUString
        :raise ParseError: per the original description, if the given
            string looks a lot like an ICU plural string but has an invalid
            structure
        """
        layout = ensure_unicode(
            r'\s*{\s*([A-Za-z-_\d]+)\s*,\s*([A-Za-z_]+)\s*,\s*(.*)}\s*'
        )
        matches = re.compile(layout).match(value)
        if matches is None:
            return None

        keyword, argument, serialized_strings = matches.groups()
        is_plural = argument == ICUParser.PLURAL_ARG
        if not is_plural:
            return None
        return self._parse_pluralized_string(
            key,
            keyword,
            value,
            serialized_strings,
        )
Exemple #23
0
    def _get_indent(self, template):
        """
        Work out the indentation width used in the original file.

        The whitespace that follows the first `:` + newline in the template
        is captured; only its last line is kept and every tab counts as 4
        spaces. When the template has no such sequence, 2 is returned.

        Args:
            template: The saved template
        Returns:
            The number of spaces.
        """
        # Match all whitespace characters after first `:` (end of first key).
        # Stops on first non whitespace character.
        indent_pattern = re.compile(
            ensure_unicode(r':\r?\n(?P<indent>[ \t\n]+)'))
        found = indent_pattern.search(template)
        whitespace = found.group('indent') if found else ' ' * 2
        # keep only last line
        last_line = whitespace.splitlines()[-1]
        return len(last_line.replace('\t', ' ' * 4))
Exemple #24
0
    def _get_indent(self, template):
        """
        Detect how many spaces the original file uses for indentation.

        Looks for the first `:` immediately followed by a newline, takes
        the whitespace after it, keeps the last line of that whitespace and
        counts its length with tabs expanded to 4 spaces. Defaults to 2
        when nothing matches.

        Args:
            template: The saved template
        Returns:
            The number of spaces.
        """
        # Match all whitespace characters after first `:` (end of first key).
        # Stops on first non whitespace character.
        pattern_source = ensure_unicode(r':\r?\n(?P<indent>[ \t\n]+)')
        match = re.compile(pattern_source).search(template)
        if match:
            indent = match.group('indent')
        else:
            indent = ' ' * 2
        # keep only last line, then count tabs as 4 spaces each
        indent = indent.splitlines()[-1].replace('\t', ' ' * 4)
        return len(indent)
Exemple #25
0
    def find_closing(self, start):
        """Find where the element opening at `start` is closed.

        Assumes `self.content[start]` is a '<'.

        returns: (closing_start, closing_end) offsets into `self.content`.
                 For a comment they delimit the `-->`; for a single tag
                 both equal the end of the opening tag. If the nesting
                 count never returns to zero the method ends without an
                 explicit return value (None).
        """
        if self.content[start:start + 4] == "<!--":
            # Special case for comment
            close_at = start + self.content[start:].index("-->")
            return close_at, close_at + 3

        tail = self.content[start:]
        opening_match = self.opening_tag_pat.search(tail)
        opening_end = start + opening_match.end()

        if self.single_tag_pat.search(opening_match.group()):
            # Single tag, eg `<foo a="b" />`
            return opening_end, opening_end

        tag_name = opening_match.groupdict()['name']
        # Matches either `<name` or `</name>`.
        # NOTE(review): `<name` also matches longer tag names sharing the
        # same prefix - confirm whether inputs can contain those.
        pattern_source = r'\<(?:(?:{tag_name})|(?:/{tag_name}\>))'.format(
            tag_name=re.escape(tag_name))
        tag_matches = re.compile(
            ensure_unicode(pattern_source)).finditer(tail)

        first_match = next(tag_matches)
        assert first_match and first_match.start() == 0 and\
            first_match.group()[1] != '/'

        open_count = 1
        for tag_match in tag_matches:
            matched = tag_match.group()
            if matched[1] == '/' or matched == "-->":
                # closing tag
                open_count -= 1
            else:
                open_count += 1
            if not open_count:
                return start + tag_match.start(), start + tag_match.end()
Exemple #26
0
class InDesignHandler(Handler):
    """A handler class that parses and compiles .idml files that are created
    in Adobe's InDesign.

    IDML files contain multiple XML fragments that can be parsed to extract
    strings from. The translatable strings are the contents of the
    `<Content>` elements found in the story fragments.
    """

    name = "InDesign"
    extension = "idml"
    SPECIFIER = None
    PROCESSES_BINARY = True
    EXTRACTS_RAW = False

    # The ? at the end of the string regex, makes it non-greedy in order to
    # allow trailing spaces to be preserved
    CONTENT_REGEX = r'(<Content>\s*)(.*?)(\s*</Content>)'
    SPECIAL_CHARACTERS_REGEX = re.compile(
        ensure_unicode(r'<\?ACE \d+\?>|<Br/>;'))

    # Parse Methods

    def __init__(self, *args, **kwargs):
        self.order = count()
        self.stringset = []
        super(InDesignHandler, self).__init__(*args, **kwargs)

    def parse(self, content, **kwargs):
        """ Parses .idml file content and returns the resource template and
            stringset.
            * Use UCF to unpack `content` to xml fragments
            * Parse all Story fragments to extract the translatable strings
              and replace them with a replacement hash
            * Pack the fragments back to create the template
            * Return the (template, stringset) tuple

            args:
                content (bytes): the binary contents of the .idml file
            returns:
                a (template, stringset) tuple, the template being bytes
        """

        # Start from a clean slate, so that parsing more than once with the
        # same handler neither repeats the strings of a previous run nor
        # continues its numbering
        self.order = count()
        self.stringset = []

        idml = UCF(io.BytesIO(content))

        # Iterate over the contents of the IDML file
        for key in self._get_ordered_stories(idml):
            try:
                # No matter what, idml values are bytes
                story_content = idml[key].decode('utf8')
            except KeyError:
                # Story referenced in designmap.xml but missing from the file
                continue

            # Update the XML file to contain the template strings
            idml[key] = self._find_and_replace(story_content).encode('utf-8')

        out = io.BytesIO()
        idml.save(out)
        return out.getvalue(), self.stringset

    def _get_ordered_stories(self, idml):
        """
        Try to find the order the stories appear in the indesign document
        * Parse designmap.xml to get the StoryList attribute.
        * Return a list with the idml keys of the stories in the order they
          appear in StoryList
        * Stories that are not referenced in StoryList follow, sorted by key
        """

        STORY_KEY = 'Stories/Story_{}.xml'
        BACKING_STORY = 'XML/BackingStory.xml'

        designmap = idml.get('designmap.xml')
        parser = etree.XMLParser(resolve_entities=False)
        designmap_tree = etree.fromstring(designmap, parser=parser)

        story_ids = designmap_tree.attrib.get("StoryList", "").split()
        story_keys = [STORY_KEY.format(s) for s in story_ids]

        # In case there are stories that are not referenced in designmap.xml,
        # append them at the end of the list
        all_stories = {
            k
            for k in six.iterkeys(idml)
            if k.startswith('Stories') or k == BACKING_STORY
        }
        # Sort the leftovers: set iteration order is not stable between
        # runs, while `parse` and `compile` must walk the stories in the
        # same order for the strings to be matched back to their hashes
        story_keys.extend(sorted(all_stories - set(story_keys)))
        return story_keys

    def _can_skip_content(self, string):
        """
        Checks whether the contents of a `<Content>` element should be left
        out of the stringset.
        Strings that contain only special characters, can be evaluated to a
        number or have no translatable character are skipped.
        """
        stripped_string = re.sub(
            ensure_unicode(self.SPECIAL_CHARACTERS_REGEX), u'', string
        ).strip()
        if not stripped_string:
            return True
        try:
            float(string.strip())
        except ValueError:
            pass
        else:
            return True
        return not self._contains_translatable_character(stripped_string)

    def _contains_translatable_character(self, string):
        """
        Checks if a string contains at least one character that can be
        translated. We assume that translatable characters are the letters,
        the symbols and the punctuation.
        """
        # First letter of the unicode general categories we accept
        acceptable = ("L", "P", "S")
        return any(
            unicodedata.category(letter)[0] in acceptable
            for letter in string
        )

    def _find_and_replace(self, story_xml):
        """
        Finds all the translatable content in the given XML string
        replaces it with the string_hash and returns the resulting
        template while updating `self.stringset` in the process.
        args:
            story_xml (str): The xml content of a single Story of the IDML file
        returns:
            the input string with all translatable content replaced by the
            template replacement of the respective string.
        """
        return re.sub(ensure_unicode(self.CONTENT_REGEX), self._replace,
                      story_xml)

    def _replace(self, match):
        """ Implements the logic used by `re.sub(self.CONTENT_REGEX, ...)` to
        replace strings with their template replacement and appends new strings
        to `self.stringset`.
        args:
            match: a match of `CONTENT_REGEX`
        returns:
            the text to put in place of the match; the match itself when the
            content is skipped
        """
        opening_tag, string, closing_tag = match.groups()

        if self._can_skip_content(string):
            return match.group()
        order = next(self.order)
        string_object = OpenString(six.text_type(order), string, order=order)
        self.stringset.append(string_object)
        return u"".join(
            (opening_tag, string_object.template_replacement, closing_tag))

    # Compile Methods

    def compile(self, template, stringset, **kwargs):
        """ Compiles the template using the strings of `stringset`.
        args:
            template (bytes): the binary IDML template produced by `parse`
            stringset: iterable with the strings to put in the template
        returns:
            the compiled binary IDML file
        """
        # The content is a binary IDML file
        idml = UCF(io.BytesIO(template))

        self.stringset = list(stringset)

        # Iterate over the contents of the IDML file
        for key in self._get_ordered_stories(idml):
            try:
                # no matter what, idml values are bytes
                story_content = idml[key].decode('utf-8')
            except KeyError:
                continue
            idml[key] = self._compile_story(story_content).encode('utf-8')

        out = io.BytesIO()
        idml.save(out)
        return out.getvalue()

    def _compile_story(self, story_content):
        """ Handles the compilation of a single story
        Strings are consumed from the front of `self.stringset` for as long
        as their hash is found in the story; the first string that is not
        found stays in the list for the stories that follow.
        args:
            story_content: the xml content of the story
        returns:
            compiled_story: the compiled story content
        """
        transcriber = Transcriber(story_content)
        # NOTE(review): the comma is a literal member of the character class,
        # it looks unintended - confirm before tightening the pattern
        hash_regex = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))

        while self.stringset:
            current_string = self.stringset[0]
            replacement = current_string.template_replacement
            try:
                hash_position = story_content.index(replacement)
            except ValueError:
                # Not part of this story, leave it for the next one
                break
            self.stringset.pop(0)
            transcriber.copy_until(hash_position)
            transcriber.add(self._escape_amps(current_string.string))
            transcriber.skip(len(replacement))

        # Update the XML file to contain the template strings
        transcriber.copy_until(len(story_content))
        compiled_story = transcriber.get_destination()
        # in case there are any hashes that have not been replaced, replace
        # them with an empty string
        return hash_regex.sub(u'', compiled_story)

    @staticmethod
    def _escape_amps(string):
        """ Escape "lonely" `&` (ampersands).

            If a valid XML escape sequence is found, it is left as it is.
            Otherwise, any occurrences of `&` are replaced with `&amp;`. Eg,

            "hello world"         -> "hello world"
            "hello &world"        -> "hello &amp;world"
            "hello &amp;world"    -> "hello &amp;world"
            "hello &lt;world"     -> "hello &lt;world"
            "hello &#x0a1f;world" -> "hello &#x0a1f;world"
            "&&#x05af;&&"         -> "&amp;&#x05af;&amp;&amp;"
        """

        # Find "lonely" ampersand positions by finding all ampersand positions
        # and subtracting the positions of ampersands that are part of valid
        # XML escape sequences
        all_amp_positions = {
            match.start()
            for match in re.finditer(r'&', string)
        }
        escaped_amp_positions = {
            match.start()
            for match in re.finditer(
                r'&(lt|gt|amp|apos|quot|#\d+|#x[0-9a-fA-F]+);', string)
        }
        target_positions = sorted(all_amp_positions - escaped_amp_positions)

        # Use Transcriber to replace lonely ampersands with '&amp;'
        transcriber = Transcriber(string)
        for position in target_positions:
            transcriber.copy_until(position)
            transcriber.add('&amp;')
            transcriber.skip(1)
        transcriber.copy_to_end()
        return transcriber.get_destination()
Exemple #27
0
 def __init__(self, *args, **kwargs):
     """Initialise the loader and keep a handle on its input.

     All arguments are forwarded untouched to the parent initialiser. At
     least one positional argument is required, since the first one is
     stored as `self.stream`.
     """
     super(TxYamlLoader, self).__init__(*args, **kwargs)
     # Set after the parent initialiser has run, so this value is the one
     # that sticks whatever the parent did with the attribute
     self.stream = args[0]
     # Matches one or more comment lines (each followed by any whitespace)
     # sitting at the very end of the text it is applied to
     self.post_block_comment_pattern = re.compile(
         ensure_unicode(r'(?:#.*\r?\n\s*)+$')
     )
    def parse(self, content, **kwargs):
        """Parse markdown content, with an optional YAML header, into a
        template and a stringset.

        The YAML header (if any) is handed to `YamlHandler`, the rest is run
        through the markdown lexer and every string it reports is swapped in
        the template with its replacement hash.

        :param content: the content of the markdown file
        :returns: a (template, stringset) tuple; the template uses the same
                    newline type as `content`
        """
        newline_type = find_newline_type(content)
        if newline_type == 'DOS':
            content = force_newline_type(content, 'UNIX')

        # mistune expands tabs to 4 spaces and trims trailing spaces, so we
        # need to do the same in order to be able to match the substrings
        spaces_only_line = re.compile(ensure_unicode(r'^ +$'), re.M)
        content = spaces_only_line.sub('', content.expandtabs(4))

        header_match = re.match(
            ensure_unicode(r'^(---\s+)([\s\S]*?[^`]\s*)(\n---\s+)(?!-)'),
            content
        )
        if header_match is None:
            yaml_template, yaml_stringset, separator = '', [], ''
            md_content = content
        else:
            header = ''.join(header_match.group(1, 2))
            separator = header_match.group(3)
            md_content = content[len(header) + len(separator):]
            yaml_template, yaml_stringset = YamlHandler().parse(header)

        md_template = md_content

        block = TxBlockLexer()
        markdown = Markdown(block=block)
        # Making sure stringset is empty because of recursive inside `markdown`
        block.md_stringset = []
        # Command that populates block.md_stringset
        markdown(md_content)

        # The strings of the header come first, the markdown ones continue
        # their numbering
        stringset = list(yaml_stringset)
        curr_pos = 0
        for token in block.md_stringset:
            text = string_handler(token, md_template)
            # Ignore any string that does not appear in the template,
            # We do this to avoid parsing strings that are not properly
            # handled by the Markdown library, such as ```code``` blocks
            if not text or text not in md_template[curr_pos:]:
                continue

            order = len(stringset)
            string_object = OpenString(six.text_type(order), text,
                                       order=order)
            stringset.append(string_object)

            replacement = string_object.template_replacement
            # Only look past the last replaced hash
            md_template = u''.join((
                md_template[:curr_pos],
                md_template[curr_pos:].replace(text, replacement, 1),
            ))
            # Keep track of the index of the last replaced hash
            curr_pos = md_template.find(replacement) + len(replacement)

        template = yaml_template + separator + md_template
        return force_newline_type(template, newline_type), stringset
Exemple #29
0
    def _handle_child_pairs(self, key_tag, dict_tag):
        """Process a main <key> tag together with the <dict> tag holding
        its value.

        To keep sentences in one piece, the text surrounding the variable of
        the format key is moved inside every plural form. So this:

            <key>NSStringLocalizedFormatKey</key>
            <string>Look! There %#@mouse@ there</string>
            <key>mouse</key>
            <dict>
                <key>NSStringFormatSpecTypeKey</key>
                <string>NSStringPluralRuleType</string>
                <key>NSStringFormatValueTypeKey</key>
                <string>d</string>
                <key>one</key>
                <string>is a mouse</string>
                <key>other</key>
                <string>are %d mice</string>
            </dict>

        is treated as if it were:

            <key>NSStringLocalizedFormatKey</key>
            <string>%#@mouse@</string>
            <key>mouse</key>
            <dict>
                <key>NSStringFormatSpecTypeKey</key>
                <string>NSStringPluralRuleType</string>
                <key>NSStringFormatValueTypeKey</key>
                <string>d</string>
                <key>one</key>
                <string>Look! There is a mouse there</string>
                <key>other</key>
                <string>Look! There are %d mice there</string>
            </dict>

        Without this inline-replacement, referenced below as [1], sentences
        would be split in Transifex or parts of the translatable content
        would be omitted.

        :param key_tag: The <key> tag to be handled.
        :param dict_tag: The <dict> tag to be handled.
        :returns: A list with the openstrings that were created, empty if
                    there were none.
        """

        # The first key tag contains the main key
        main_key = self._handle_key(key_tag, main_key=True)
        children = self._handle_dict(dict_tag)

        # Prefix, suffix and variable needed for the inline-replacement [1];
        # stays None until a format key with matching content is met
        text_extras = None
        openstrings = []

        for key_child in children:
            # The second key contains the secondary key.
            secondary_key = self._handle_key(key_child)
            value_tag = self._get_key_value(children, key_child)

            if secondary_key != self.KEY_FORMAT:
                openstring = self._handle_strings(
                    value_tag,
                    main_key,
                    secondary_key,
                    text_extras,
                )
                # Keep only the openstrings that were actually created
                if openstring is not None:
                    openstrings.append(openstring)
            else:
                # The format key is one of the stringsdict defaults, it
                # produces no string of its own
                matches = re.match(ensure_unicode(self.VALUE_CONTENT_RE),
                                   value_tag.content)
                if matches is not None:
                    # The prefix and the suffix are relative to the FIRST
                    # variable
                    text_extras = {
                        group: matches.group(group)
                        for group in ("variable", "prefix", "suffix")
                    }
        return openstrings
def string_handler(token, template):
    """
    Extra checks and manipulation of extracted string from markdown file.
    Parameters:
    token: Tuple of (string, string_type) where string_type refers to the
           type of markdown element this string belongs to. string_type
           can be None.
    template: the template of the resource

    returns: the manipulated string or None in case the manipulated string
             is not valid anymore e.g. a liquid tag or a number
    """

    # Drop new lines around string.
    string, key = token
    string = string.strip('\n')

    # for code blocks we need to maintain the exact indentation as in
    # the source file both for matching the string and replacing it in the
    # template and for producing a valid markdown on compilation
    if key == 'block_code':
        lines = string.split('\n')
        indentations = re.findall(
            ensure_unicode(r'\n( *){}'.format(re.escape(lines[0]))),
            template
        )
        # The first line may not be found after a newline, e.g. when the
        # code block opens the template; leave the string untouched then
        # instead of failing with an IndexError
        spaces = indentations[0] if indentations else ''
        if spaces:
            string = ''.join('\n' + spaces + line for line in lines)

    # Line is a liquid template tag, ignore.
    if string.startswith('{%') and string.endswith('%}'):
        return

    # Drop # chars from beginning of the string
    match_header_line = re.search(ensure_unicode(r'^#+\s'), string)
    if match_header_line:
        # Cut off the leading marker only; replacing its text would also
        # strip identical '# ' sequences further inside the header, leaving
        # a string that no longer appears in the template
        return string[match_header_line.end():]

    # Extract Text from `[Text](link)` or `"[Text](link)"` lines
    match_link = re.search(ensure_unicode(r'^"?\[(.+)\]\(.+\)"?$'), string)
    if match_link:
        # Get content between brackets
        return match_link.groups()[0]

    # Extract Text from `[Text]: link` or `"[Text]: link"` lines
    match_reference = re.search(ensure_unicode(r'^"?\[(.+)\]:.+"?$'), string)
    if match_reference:
        try:
            int(match_reference.groups()[0])
        except ValueError:
            # Get content between brackets if it's not an integer number
            return match_reference.groups()[0]
        return

    # exclude numeric values from stringset
    try:
        float(string)
        return
    except ValueError:
        pass

    return string
Exemple #31
0
 def __init__(self, *args, **kwargs):
     """Initialise the loader and keep a handle on its input.

     All arguments are forwarded untouched to the parent initialiser. At
     least one positional argument is required, since the first one is
     stored as `self.stream`.
     """
     super(TxYamlLoader, self).__init__(*args, **kwargs)
     # Set after the parent initialiser has run, so this value is the one
     # that sticks whatever the parent did with the attribute
     self.stream = args[0]
     # Matches one or more comment lines (each followed by any whitespace)
     # sitting at the very end of the text it is applied to
     self.post_block_comment_pattern = re.compile(
         ensure_unicode(r'(?:#.*\r?\n\s*)+$')
     )
Exemple #32
0
class AndroidHandler(Handler):
    """A handler class that parses and compiles String Resources for ANDROID
    applications. The String Resources file is in XML format.

    String Resources file documentation can be found here:
    http://developer.android.com/guide/topics/resources/string-resource.html
    """

    name = "ANDROID"
    extension = "xml"

    EXTRACTS_RAW = True

    SPECIFIER = re.compile(
        ensure_unicode(
            r'%((?:(?P<ord>\d+)\$|\((?P<key>\w+)\))?(?P<fullvar>[+#\- 0]*(?:\d+)?'
            r'(?:\.\d+)?(hh\|h\|l\|ll|j|z|t|L)?(?P<type>[diufFeEgGxXaAoscpn%])))'
        ))

    # Where to start parsing the file
    PARSE_START = "<resources"

    # Relevant tags
    STRING = "string"
    STRING_PLURAL = "plurals"
    STRING_ARRAY = "string-array"

    # Relevant children
    STRING_ITEM = "item"

    # Attributes that if the child contains it should be skipped
    SKIP_ATTRIBUTES = {'translatable': 'false'}

    # Compile plural template
    PLURAL_TEMPLATE = u'<item quantity="{rule}">{string}</item>'
    """ Parse Methods """
    @reraise_syntax_as_parse_errors
    def parse(self, content, **kwargs):
        """Parse the content of a String Resources file.

        Every `string`, `string-array` and `plurals` child of the
        `<resources` tag, as well as every comment, is passed to
        `_handle_child`; the strings that come back are gathered while the
        transcriber builds the template.

        :param content: The content of the file.
        :returns: A tuple (template, stringset).
        """
        self.transcriber = Transcriber(content)
        self.current_comment = u""
        self.order_counter = itertools.count()

        source = self.transcriber.source
        # Skip XML info declaration
        root = DumbXml(source, source.index(self.PARSE_START))
        XMLUtils.validate_no_text_characters(self.transcriber, root)
        XMLUtils.validate_no_tail_characters(self.transcriber, root)

        relevant_children = root.find_children(self.STRING,
                                               self.STRING_ARRAY,
                                               self.STRING_PLURAL,
                                               DumbXml.COMMENT)
        collected = []
        self.existing_hashes = {}
        for child in relevant_children:
            strings = self._handle_child(child)
            if strings is None:
                continue
            collected.extend(strings)
            # The comment has been used by the strings just created
            self.current_comment = u""

        # Copy whatever follows the last handled child
        self.transcriber.copy_until(len(source))
        return self.transcriber.get_destination(), collected

    def _handle_child(self, child):
        """Validate the child and hand it over to the method that deals
        with its tag.

        An ignored child also clears the current comment; a comment child
        becomes the current comment.

        :returns: A list of OpenString objects if any were created else None.
        """
        XMLUtils.validate_no_tail_characters(self.transcriber, child)

        if self._should_ignore(child):
            self.current_comment = u""
            return None

        if child.tag == DumbXml.COMMENT:
            self._handle_comment(child)
            return None

        if child.tag == self.STRING:
            return self._handle_string(child)

        if child.tag == self.STRING_ARRAY:
            XMLUtils.validate_no_text_characters(self.transcriber, child)
            return self._handle_string_array(child)

        if child.tag == self.STRING_PLURAL:
            XMLUtils.validate_no_text_characters(self.transcriber, child)
            return self._handle_string_plural(child)

        return None

    def _handle_string(self, child):
        """Deal with a child element that has the `string` tag.

        When an OpenString is created out of the child's content, that
        content is swapped in the template with the string's replacement.

        :returns: A list holding the OpenString object if one was created
                    else None.
        """
        name, product = self._get_child_attributes(child)
        openstring = self._create_string(name, child.content,
                                         self.current_comment, product, child)
        if openstring is None:
            return None

        # Copy up to the start of the text:
        # <string>My Text</string>
        #         ^
        self.transcriber.copy_until(child.text_position)
        self.transcriber.add(openstring.template_replacement)
        # Jump over the original content:
        # <string>My Text</string>
        #                ^
        self.transcriber.skip(len(child.content))
        return [openstring]

    def _handle_string_plural(self, child):
        """Deal with a child element that has the `plurals` tag.

        The content of each non-comment child is gathered per plural rule
        and a single pluralized OpenString is created out of them.

        :raises: Parse error if the `quantity` attribute is missing from any
                    of the child's children
        :returns: A list holding the OpenString object if one was created
                    else None.
        """

        rule_to_text = {}
        # Go through all the children, comments are not plural forms
        for item_tag in child.find_children():
            if item_tag.tag == DumbXml.COMMENT:
                continue
            rule_number = self._validate_plural_item(item_tag)
            rule_to_text[rule_number] = item_tag.content

        name, product = self._get_child_attributes(child)
        openstring = self._create_string(
            name,
            rule_to_text,
            self.current_comment,
            product,
            child,
            # <plurals> tags always define plurals, even if the language has
            # one plural form and thus there's only one <item>
            pluralized=True,
        )
        if openstring is None:
            return None

        # <plurals name="foo">   <item>Hello ...
        #                        ^
        self.transcriber.copy_until(
            child.text_position + len(child.text or '')
        )
        self.transcriber.add(openstring.template_replacement)
        # ...</item>   </plurals>...
        #           ^
        # FYI: item_tag is still the last child the loop above went over.
        self.transcriber.skip_until(item_tag.tail_position)
        return [openstring]

    def _handle_string_array(self, child):
        """Deal with a child element that has the `string-array` tag.

        One OpenString is attempted per `item` child; its name is the name
        of the array followed by the index of the item in brackets.

        :returns: A list with the OpenString objects if any were created
                    else None.
        """
        name, product = self._get_child_attributes(child)
        created = []
        for index, item_tag in enumerate(
                child.find_children(self.STRING_ITEM)):
            XMLUtils.validate_no_tail_characters(self.transcriber, item_tag)
            openstring = self._create_string(
                u"{}[{}]".format(name, index), item_tag.content,
                self.current_comment, product, child)
            if openstring is None:
                continue

            # ... <item>Hello...
            #           ^
            self.transcriber.copy_until(item_tag.text_position)
            created.append(openstring)
            self.transcriber.add(openstring.template_replacement)
            # ...ello world</item>...
            #              ^
            self.transcriber.skip(len(item_tag.content))

        return created or None

    def _handle_comment(self, child):
        """Will assign the comment found as the current comment.

        :param child: The comment child; its content replaces whatever is
                        stored in `self.current_comment`.
        """
        self.current_comment = child.content

    def _create_string(self,
                       name,
                       text,
                       comment,
                       product,
                       child,
                       pluralized=False):
        """Build the OpenString for a tag, unless its text is empty.

        The (name, product) pairs met so far are tracked together with the
        tags they were met in: meeting a pair again in the same tag is
        reported as an error, while meeting it in a different tag has the
        tag appended to the product.

        :param name: The name of the string.
        :param text: The strings text.
        :param comment: The developer's comment the string might have.
        :param product: Extra context for the string.
        :param child: The child tag that the string is created from.
                        Used to find line numbers when errors occur.
        :param pluralized: Passed as is to the OpenString.
        :returns: Returns an OpenString object if the text is not empty
                  else None.
        """
        has_text = XMLUtils.validate_not_empty_string(
            self.transcriber,
            text,
            child,
            error_context={
                'main_tag': 'plural',
                'child_tag': 'item'
            },
        )
        if not has_text:
            return None

        seen_tags = self.existing_hashes.get((name, product))
        if seen_tags is not None:
            if child.tag in seen_tags:
                format_dict = {'name': name, 'child_tag': child.tag}
                if product:
                    msg = (u"Duplicate `tag_name` ({child_tag}) for `name`"
                           u" ({name}) and `product` ({product}) "
                           u"found on line {line_number}")
                    format_dict['product'] = product
                else:
                    msg = (u"Duplicate `tag_name` ({child_tag}) for `name`"
                           u" ({name}) specify a product to differentiate")
                XMLUtils.raise_error(self.transcriber,
                                     child,
                                     msg,
                                     context=format_dict)
            else:
                # Same name and product under another tag, use the tag to
                # tell the two apart
                product += child.tag

        openstring = OpenString(
            name,
            text,
            context=product,
            order=next(self.order_counter),
            developer_comment=comment,
            pluralized=pluralized,
        )
        self.existing_hashes.setdefault((name, product), []).append(child.tag)
        return openstring

    def _validate_plural_item(self, item_tag):
        """Check that a child of a `plurals` tag is a valid plural item.

        The child must be an `item` tag with no tail characters and with a
        `quantity` attribute that maps to a plural rule.

        :param item_tag: The item to perform the checks on.
        :raises: ParseError if the item tag does not meet the requirements.
        :returns: The plural number of the validated item tag.
        """
        if item_tag.tag != self.STRING_ITEM:
            XMLUtils.raise_error(
                self.transcriber,
                item_tag,
                (u"Wrong tag type found on line {line_number}. Was "
                 u"expecting <item> but found <{wrong_tag}>"),
                context={'wrong_tag': item_tag.tag},
            )

        XMLUtils.validate_no_tail_characters(self.transcriber, item_tag)

        rule = item_tag.attrib.get('quantity')
        if rule is None:
            # If quantity is missing, the plural is unknown
            XMLUtils.raise_error(
                self.transcriber,
                item_tag,
                u"Missing the `quantity` attribute on line {line_number}",
            )

        try:
            rule_number = self.get_rule_number(rule)
        except RuleError:
            XMLUtils.raise_error(
                self.transcriber,
                item_tag,
                (u"The `quantity` attribute on line {line_number} contains "
                 u"an invalid plural: `{rule}`"),
                context={'rule': rule},
            )
        return rule_number

    def _get_child_attributes(self, child):
        """Read the `name` and `product` attributes of a child.

        Backslashes and opening brackets in the name are escaped with a
        backslash; a missing product becomes an empty string.

        :param child: The child to retrieve the attributes from.
        :returns: Returns a tuple (`name`, `product`)
        :raises: It raises a ParseError if no `name` attribute is present.
        """
        name = child.attrib.get('name')
        if name is None:
            XMLUtils.raise_error(
                self.transcriber,
                child,
                u'Missing the `name` attribute on line {line_number}',
            )

        # Backslashes go first, so that the ones added for the brackets are
        # not escaped again
        escaped_backslash = u''.join([DumbXml.BACKSLASH, DumbXml.BACKSLASH])
        escaped_bracket = u''.join([DumbXml.BACKSLASH, u'['])
        name = name.replace(DumbXml.BACKSLASH, escaped_backslash)
        name = name.replace(u'[', escaped_bracket)

        return name, child.attrib.get('product', '')

    """ Compile Methods """

    def compile(self,
                template,
                stringset,
                is_source=True,
                language_info=None,
                **kwargs):
        """Compile the template using the strings of `stringset`.

        Whatever precedes the `<resources` tag is kept as is; only the rest
        goes through the transcriber.

        :param template: The template, as produced by `parse`.
        :param stringset: Iterable with the strings to compile.
        :param is_source: Stored in `self.is_source` for the methods that
                            compile the children.
        :param language_info: When given and the `<resources` tag has a
                            `tools:locale` attribute, the value of the
                            attribute is replaced by `language_info['code']`.
        :returns: The compiled file content.
        """
        resources_start = template.index(self.PARSE_START)
        preamble = template[:resources_start]

        self.transcriber = Transcriber(template[resources_start:])
        source = self.transcriber.source
        resources = DumbXml(source)

        # Check against 'tools:locale' attribute
        if language_info is not None and 'tools:locale' in resources.attrib:
            locale_position, locale_value = next(
                (position, value)
                for _, key, position, value in resources.attributes
                if key == 'tools:locale'
            )
            self.transcriber.copy_until(locale_position)
            self.transcriber.add(language_info['code'])
            self.transcriber.skip(len(locale_value))

        # This is needed in case the first tag is skipped to retain
        # the file's formating
        self.transcriber.copy_until(
            resources.text_position + len(resources.text)
        )

        compilable_children = resources.find_children(self.STRING,
                                                      self.STRING_ARRAY,
                                                      self.STRING_PLURAL)

        self.is_source = is_source
        self.stringset = iter(stringset)
        self.next_string = self._get_next_string()
        for child in compilable_children:
            self._compile_child(child)

        # Copy whatever follows the last compiled child
        self.transcriber.copy_until(len(source))
        return preamble + self.transcriber.get_destination()

    def _compile_child(self, child):
        """Hand the child over to the method that compiles its tag.

        An ignored child is copied as is when compiling the source and is
        skipped otherwise.
        """
        if self._should_ignore(child):
            if self.is_source:
                self.transcriber.copy_until(child.end)
            else:
                self._skip_tag(child)
            return

        # Checked in this order, the first tag that matches wins
        compilers = (
            (self.STRING, self._compile_string),
            (self.STRING_ARRAY, self._compile_string_array),
            (self.STRING_PLURAL, self._compile_string_plural),
        )
        for tag, compile_tag in compilers:
            if child.tag == tag:
                compile_tag(child)
                break

    def _compile_string(self, child):
        """Compile a single `string` or `item` tag.

        When the tag matches the string that is next in line, its content
        is replaced by that string and the following string is fetched.
        A non-matching tag is removed, unless it has no text at all: such
        a tag (e.g. an originally empty string-array element) is left for
        the caller to copy.

        :param child: The tag to compile.
        """
        if not self._should_compile(child):
            if child.text:
                self._skip_tag(child)
            return

        out = self.transcriber
        out.copy_until(child.text_position)
        out.add(self.next_string.string)
        out.skip_until(child.content_end)
        out.copy_until(child.tail_position)
        # The tail of the tag is recorded as a section of its own.
        out.mark_section_start()
        out.copy_until(child.end)
        out.mark_section_end()
        self.next_string = self._get_next_string()

    def _compile_string_array(self, child):
        """Compile a `string-array` tag.

        Three outcomes are possible:

        - the array has no `item` children (a placeholder): it is copied
          untouched;
        - none of its items matches the string that is next in line: the
          whole `string-array` tag is removed;
        - otherwise every item goes through `_compile_string`, which
          compiles the matching ones and removes the rest.

        :param child: The `string-array` tag to compile.
        """
        items = list(child.find_children(self.STRING_ITEM))

        if not items:
            self.transcriber.copy_until(child.end)
            return

        if not any(self._should_compile(item) for item in items):
            self._skip_tag(child)
            return

        # Copy the opening <string-array> tag before touching the items.
        self.transcriber.copy_until(items[0].start)
        for item in items:
            self._compile_string(item)
        # Drop the section recorded by the transcriber and put the tail of
        # the last item in its place.
        self.transcriber.remove_section()
        self.transcriber.add(items[-1].tail)
        self.transcriber.copy_until(child.end)

    def _compile_string_plural(self, child):
        """Compile a `plurals` tag.

        A `plurals` tag that still has `item` children is a placeholder and
        is left alone. Otherwise, if it matches the string that is next in
        line, one `item` is emitted per plural rule of that string; if it
        does not match, the tag is removed.

        :param child: The `plurals` tag to compile.
        """
        # Placeholder (still holds its own item children): nothing to do.
        if list(child.find_children(self.STRING_ITEM)):
            return

        if not self._should_compile(child):
            self._skip_tag(child)
            return

        out = self.transcriber
        out.copy_until(child.text_position)

        # The whitespace surrounding the replacement hash is reused around
        # every generated item.
        pieces = child.content.split(self.next_string.template_replacement)
        before, after = pieces[0], pieces[1]

        # Newline formatting: emit the shared leading part only once.
        if before.startswith(after):
            before = before.replace(after, '', 1)
            out.add(after)

        for rule, string in six.iteritems(self.next_string.string):
            item = self.PLURAL_TEMPLATE.format(
                rule=self.get_rule_string(rule), string=string
            )
            out.add(before + item + after)

        out.skip_until(child.content_end)
        out.copy_until(child.end)
        self.next_string = self._get_next_string()

    def _should_compile(self, child):
        """Tell whether the child is the placeholder of the next string.

        :param child: The child whose stripped content is compared against
                      the template replacement of `self.next_string`.
        :returns: True if there is a next string and its template
                  replacement equals the stripped content, else False.
        """
        raw_content = child.content
        stripped = raw_content.strip() if raw_content else ''
        candidate = self.next_string
        if candidate is None:
            return False
        return candidate.template_replacement == stripped

    def _skip_tag(self, tag):
        """Leave a tag out of the compiled output.

        :param tag: The tag to drop; the transcriber skips up to `tag.end`.
        """
        skip_until = self.transcriber.skip_until
        skip_until(tag.end)

    def _get_next_string(self):
        """Pull the next string out of the `self.stringset` iterator.

        :returns: The next item of the iterator, or None once the iterator
                  is exhausted.
        """
        return next(self.stringset, None)

    """ Util Methods """

    @staticmethod
    def _should_ignore(child):
        """Tell whether the child carries an attribute that excludes it.

        :param child: The child whose attributes are checked against the
                      key:value pairs of `AndroidHandler.SKIP_ATTRIBUTES`.
        :returns: True if at least one pair is present on the child with
                  the same value, else False.
        """
        skip_pairs = six.iteritems(AndroidHandler.SKIP_ATTRIBUTES)
        for attr_name, skip_value in skip_pairs:
            actual = child.attrib.get(attr_name)
            if actual is None:
                continue
            if actual == skip_value:
                return True
        return False

    # Escaping / Unescaping
    #
    # INLINE_TAGS holds the tag names that `escape()` hands over to
    # `xml_escape` as the tags that are allowed inside strings.
    # The list follows:
    # https://developer.android.com/guide/topics/resources/string-resource#FormattingAndStyling
    # https://developer.android.com/guide/topics/resources/string-resource#StylingWithHTML
    INLINE_TAGS = ("xliff:g", "a", "annotation", "b", "em", "i", "cite", "dfn",
                   "big", "small", "font", "tt", "s", "strike", "del", "u",
                   "sup", "sub", "ul", "li", "br", "div", "span", "p")

    @staticmethod
    def escape(string):
        """Escape a piece of text so it can be stored in an Android file.

        Quotes get a leading backslash, and so does a leading at-sign that
        is not a `@string/` reference. The actual walk over the string is
        delegated to `xml_escape`, which receives the tags listed in
        `AndroidHandler.INLINE_TAGS`. Examples:
          "hello" world      => \\"hello\\" world
          <a b="c">hello</a> => <a b="c">hello</a>
          <x y="z">hello</x> => <x y=\\"z\\">hello</x>

        :param str string: string to be escaped
        :return: escaped string
        :rtype: unicode
        """
        escaped_double = u''.join([DumbXml.BACKSLASH, DumbXml.DOUBLE_QUOTES])
        escaped_single = u''.join([DumbXml.BACKSLASH, DumbXml.SINGLE_QUOTE])

        def _escape_text(text):
            # A leading at-sign would be read as a resource reference, so
            # unless it really is a reference to another string it gets a
            # backslash in front of it.
            is_reference = text.startswith(u'@string/')
            if text.startswith(u'@') and not is_reference:
                text = text.replace(u'@', u'\\@', 1)
            text = text.replace(DumbXml.DOUBLE_QUOTES, escaped_double)
            text = text.replace(DumbXml.SINGLE_QUOTE, escaped_single)
            return text

        return xml_escape(string, AndroidHandler.INLINE_TAGS, _escape_text)

    @staticmethod
    def unescape(string):
        """Undo the Android escaping of a string.

        A leading escaped at-sign loses its backslash. A string wrapped in
        double quotes loses the wrapping quotes and only has its escaped
        double quotes restored; any other string has both its escaped
        single and escaped double quotes restored.

        :param string: the escaped string
        :return: the unescaped string
        """
        double = DumbXml.DOUBLE_QUOTES
        single = DumbXml.SINGLE_QUOTE
        escaped_double = u''.join([DumbXml.BACKSLASH, double])
        escaped_single = u''.join([DumbXml.BACKSLASH, single])

        # Do not display the backslash of an escaped leading at-sign.
        if string.startswith(u'\\@'):
            string = string[1:]

        wrapped_in_quotes = (
            len(string) and string[0] == string[-1] == double
        )
        if wrapped_in_quotes:
            inner = string[1:-1]
            return inner.replace(escaped_double, double)

        without_single = string.replace(escaped_single, single)
        return without_single.replace(escaped_double, double)
Exemple #33
0
    def parse(self, content, **kwargs):
        """Extract the strings of a markdown document.

        The document may start with a `---` delimited header; when it does,
        the header is parsed with `YamlHandler` and its strings come first
        in the returned stringset. The markdown body is run through
        `Markdown` with a `TxBlockLexer`, and every collected string that
        survives `string_handler` and can be found in the body is replaced
        by its template replacement.

        :param content: The content of the file.
        :returns: A tuple of (template, stringset); the template uses the
                  newline type found in `content`.
        """
        newline_type = find_newline_type(content)
        if newline_type == 'DOS':
            content = force_newline_type(content, 'UNIX')

        # mistune expands tabs to 4 spaces and trims trailing spaces, so
        # the same is done here to be able to match the substrings later.
        expanded = content.expandtabs(4)
        spaces_only_line = re.compile(ensure_unicode(r'^ +$'), re.M)
        content = spaces_only_line.sub('', expanded)

        header_match = re.match(
            ensure_unicode(r'^(---\s+)([\s\S]*?[^`]\s*)(\n---\s+)(?!-)'),
            content
        )

        yaml_stringset = []
        yaml_template = ''
        separator = ''
        if header_match:
            header_text = ''.join(header_match.group(1, 2))
            separator = header_match.group(3)
            md_content = content[len(header_text + separator):]
            yaml_template, yaml_stringset = YamlHandler().parse(header_text)
            for header_string in yaml_stringset:
                self._unescape_non_printable(header_string)
        else:
            md_content = content

        md_template = md_content

        lexer = TxBlockLexer()
        render = Markdown(block=lexer)
        # Start from an empty list before the run; the original note says
        # this is needed because of recursion inside `markdown`.
        lexer.md_stringset = []
        # Running the renderer is what fills `lexer.md_stringset`.
        render(md_content)

        stringset = list(yaml_stringset)
        order = len(stringset)
        search_from = 0
        for token in lexer.md_stringset:
            string = string_handler(token, md_template)
            # Strings that cannot be found in what is left of the template
            # are ignored; this avoids parsing strings that the Markdown
            # library does not handle properly, such as ```code``` blocks.
            if not string or string not in md_template[search_from:]:
                continue

            string_object = OpenString(six.text_type(order),
                                       string,
                                       order=order)
            order += 1
            stringset.append(string_object)

            placeholder = string_object.template_replacement
            done = md_template[:search_from]
            pending = md_template[search_from:]
            md_template = done + pending.replace(string, placeholder, 1)

            # Continue the search right after the hash just inserted.
            search_from = md_template.find(placeholder) + len(placeholder)

        template = yaml_template + separator + md_template
        return force_newline_type(template, newline_type), stringset
Exemple #34
0
    def _handle_child_pairs(self, key_tag, dict_tag):
        """Create the openstrings of a <key> tag and its <dict> value tag.

        The text surrounding the first variable of the format string is
        moved into the plural strings themselves (referred to as the
        inline-replacement [1] in the comments below), so that sentences
        are not split and no translatable content is left out. That is:

            <key>NSStringLocalizedFormatKey</key>
            <string>Look! There %#@mouse@ there</string>
            <key>mouse</key>
            <dict>
                <key>NSStringFormatSpecTypeKey</key>
                <string>NSStringPluralRuleType</string>
                <key>NSStringFormatValueTypeKey</key>
                <string>d</string>
                <key>one</key>
                <string>is a mouse</string>
                <key>other</key>
                <string>are %d mice</string>
            </dict>

        is treated as if it were:

            <key>NSStringLocalizedFormatKey</key>
            <string>%#@mouse@</string>
            <key>mouse</key>
            <dict>
                <key>NSStringFormatSpecTypeKey</key>
                <string>NSStringPluralRuleType</string>
                <key>NSStringFormatValueTypeKey</key>
                <string>d</string>
                <key>one</key>
                <string>Look! There is a mouse there</string>
                <key>other</key>
                <string>Look! There are %d mice there</string>
            </dict>

        :param key_tag: The <key> tag to be handled.
        :param dict_tag: The <dict> tag to be handled.
        :returns: A list with the openstrings that were created; empty if
                    none was.
        """
        # The first key tag holds the main key.
        primary_key = self._handle_key(key_tag, main_key=True)
        # This iterator is shared with `_get_key_value`, which is handed
        # the iterator to fetch the value that belongs to each key.
        dict_children = self._handle_dict(dict_tag)

        created = []
        # Prefix and suffix needed for the inline-replacement [1]; stays
        # None until a format key with matching content has been seen.
        inline_extras = None

        for child_key_tag in dict_children:
            # Keys inside the dict are the secondary keys.
            sub_key = self._handle_key(child_key_tag)
            value_tag = self._get_key_value(dict_children, child_key_tag)

            if sub_key == self.KEY_FORMAT:
                # One of the stringsdict defaults: it yields no string, it
                # only provides the text around the FIRST variable.
                found = re.match(ensure_unicode(self.VALUE_CONTENT_RE),
                                 value_tag.content)
                if found is not None:
                    inline_extras = {
                        group_name: found.group(group_name)
                        for group_name in ("variable", "prefix", "suffix")
                    }
                continue

            openstring = self._handle_strings(
                value_tag, primary_key, sub_key, inline_extras
            )
            if openstring is not None:
                created.append(openstring)

        return created