def _compile_story(self, story_content):
    """Compile a single story by swapping hashes for their strings.

    Strings are taken from the front of ``self.stringset`` for as long as
    the hash of the first remaining string can be located in the story. A
    string whose hash is not found stays at the head of the list.

    args:
        story_content: the xml content of the story
    returns:
        compiled_story: the compiled story content
    """
    leftover_hash_pattern = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))
    transcriber = Transcriber(story_content)

    while True:
        try:
            candidate = self.stringset[0]
        except IndexError:
            # Nothing left to place
            break
        try:
            position = story_content.index(candidate.template_replacement)
        except ValueError:
            # Hash is not in this story; keep the string at the head of
            # the list and stop
            break
        self.stringset.pop(0)
        transcriber.copy_until(position)
        transcriber.add(candidate.string)
        transcriber.skip(len(candidate.template_replacement))

    # Copy whatever follows the last replaced hash
    transcriber.copy_until(len(story_content))

    # Any hash that was not replaced is substituted with an empty string
    return leftover_hash_pattern.sub(u'', transcriber.get_destination())
def parse(self, content, **kwargs):
    """Extract the strings found inside the ``<resources`` part of `content`.

    Binary content is decoded as UTF-8. Everything before ``<resources`` is
    carried over to the template untouched; the rest is walked tag by tag
    and handed to the per-tag handlers, which drive ``self.transcriber``.

    Returns:
        A ``(template, stringset)`` tuple.
    """
    if isinstance(content, six.binary_type):
        content = content.decode("utf-8")  # convert to unicode

    resources_start = content.index("<resources")
    prologue = content[:resources_start]

    self.transcriber = Transcriber(content[resources_start:])
    source = self.transcriber.source
    self._order = 0

    extracted = []
    pending_comment = ""
    wanted_tags = ("string-array", "string", "plurals", DumbXml.COMMENT)
    for tag, offset in DumbXml(source).find(wanted_tags):
        if self._should_ignore(tag):
            pending_comment = ""
            continue

        if tag.name == DumbXml.COMMENT:
            # Remember the comment for the tag that follows it
            pending_comment = tag.inner
            self.transcriber.copy_until(offset + len(tag.content))
            continue

        if tag.name == "string":
            produced = [
                self._handle_string_tag(tag, offset, pending_comment)
            ]
        elif tag.name == "string-array":
            produced = list(
                self._handle_string_array_tag(tag, offset, pending_comment)
            )
        elif tag.name == "plurals":
            produced = [
                self._handle_plurals_tag(tag, offset, pending_comment)
            ]
        else:
            continue

        # A comment only applies to the first tag after it
        pending_comment = ""
        extracted.extend(item for item in produced if item is not None)

    self.transcriber.copy_until(len(source))
    template = prologue + self.transcriber.get_destination()
    self.transcriber = None

    return template, extracted
def compile(self, template, stringset, **kwargs):
    """Build the compiled file from `template` and `stringset`.

    A first pass hands every non-ignored ``string``, ``string-array`` and
    ``plurals`` tag to its ``_compile_*`` method. A second pass over that
    result drops ``string-array`` tags that have inner content but no
    ``item`` children.

    Returns:
        The text before ``<resources`` followed by the compiled resources.
    """
    resources_start = template.index("<resources")
    self._stringset = list(stringset)
    self._stringset_index = 0

    # First pass: per-tag compilation
    self.transcriber = Transcriber(template[resources_start:])
    self.source = self.transcriber.source
    compilers = {
        "string": "_compile_string",
        "string-array": "_compile_string_array",
        "plurals": "_compile_plurals",
    }
    first_pass_tags = DumbXml(self.source).find(
        ("string", "string-array", "plurals")
    )
    for tag, offset in first_pass_tags:
        if self._should_ignore(tag):
            continue
        method_name = compilers.get(tag.name)
        if method_name is not None:
            getattr(self, method_name)(tag, offset)
    self.transcriber.copy_until(len(self.source))

    # Second pass: clear empty <string-array>s
    self.transcriber = Transcriber(self.transcriber.get_destination())
    self.source = self.transcriber.source
    for array_tag, array_offset in DumbXml(self.source).find("string-array"):
        if not array_tag.inner:
            continue
        if list(array_tag.find("item")):
            continue
        self.transcriber.copy_until(array_offset)
        self.transcriber.skip(len(array_tag.content))
    self.transcriber.copy_until(len(self.source))

    compiled = template[:resources_start] + \
        self.transcriber.get_destination()

    self._stringset = None
    self._stringset_index = None
    self.transcriber = None

    return compiled
def compile(self, template, stringset, **kwargs):
    """Build the compiled file from `template` and `stringset`.

    A first pass hands every non-ignored ``string``, ``string-array`` and
    ``plurals`` tag to its ``_compile_*`` method. A second pass over that
    result drops ``string-array`` tags that have inner content but no
    ``item`` children.

    Returns:
        The text before ``<resources`` followed by the compiled resources.
    """
    resources_start = template.index("<resources")
    self._stringset = list(stringset)
    self._stringset_index = 0

    # First pass: per-tag compilation
    self.transcriber = Transcriber(template[resources_start:])
    self.source = self.transcriber.source
    compilers = {
        "string": "_compile_string",
        "string-array": "_compile_string_array",
        "plurals": "_compile_plurals",
    }
    first_pass_tags = DumbXml(self.source).find(
        ("string", "string-array", "plurals")
    )
    for tag, offset in first_pass_tags:
        if self._should_ignore(tag):
            continue
        method_name = compilers.get(tag.name)
        if method_name is not None:
            getattr(self, method_name)(tag, offset)
    self.transcriber.copy_until(len(self.source))

    # Second pass: clear empty <string-array>s
    self.transcriber = Transcriber(self.transcriber.get_destination())
    self.source = self.transcriber.source
    for array_tag, array_offset in DumbXml(self.source).find("string-array"):
        if not array_tag.inner:
            continue
        if list(array_tag.find("item")):
            continue
        self.transcriber.copy_until(array_offset)
        self.transcriber.skip(len(array_tag.content))
    self.transcriber.copy_until(len(self.source))

    compiled = template[:resources_start] + \
        self.transcriber.get_destination()

    self._stringset = None
    self._stringset_index = None
    self.transcriber = None

    return compiled
def parse(self, content):
    """Split `content` into subtitle sections and extract their strings.

    For each section that yields a string, the subtitle text is replaced in
    the template by the string's ``template_replacement``; other sections
    are copied as they are.

    Returns:
        A ``(template, stringset)`` tuple.
    """
    self.transcriber = Transcriber(content)
    source = self.transcriber.source
    self.max_order = None

    collected = []
    for section_start, section in self._generate_split_subtitles(source):
        self.transcriber.copy_until(section_start)
        text_offset, parsed = self._parse_section(section_start, section)
        if not parsed:
            self.transcriber.copy_until(section_start + len(section))
            continue
        collected.append(parsed)
        # Keep order number and timings, swap the text for the hash
        self.transcriber.copy_until(text_offset)
        self.transcriber.add(parsed.template_replacement)
        self.transcriber.skip(len(parsed.string))

    self.transcriber.copy_until(len(source))
    return self.transcriber.get_destination(), collected
def parse(self, content, **kwargs):
    """Extract the strings found inside the ``<resources`` part of `content`.

    Binary content is decoded as UTF-8. Everything before ``<resources`` is
    carried over to the template untouched; the rest is walked tag by tag
    and handed to the per-tag handlers, which drive ``self.transcriber``.

    Returns:
        A ``(template, stringset)`` tuple.
    """
    if isinstance(content, six.binary_type):
        content = content.decode("utf-8")  # convert to unicode

    resources_start = content.index("<resources")
    prologue = content[:resources_start]

    self.transcriber = Transcriber(content[resources_start:])
    source = self.transcriber.source
    self._order = 0

    extracted = []
    pending_comment = ""
    wanted_tags = ("string-array", "string", "plurals", DumbXml.COMMENT)
    for tag, offset in DumbXml(source).find(wanted_tags):
        if self._should_ignore(tag):
            pending_comment = ""
            continue

        if tag.name == DumbXml.COMMENT:
            # Remember the comment for the tag that follows it
            pending_comment = tag.inner
            self.transcriber.copy_until(offset + len(tag.content))
            continue

        if tag.name == "string":
            produced = [
                self._handle_string_tag(tag, offset, pending_comment)
            ]
        elif tag.name == "string-array":
            produced = list(
                self._handle_string_array_tag(tag, offset, pending_comment)
            )
        elif tag.name == "plurals":
            produced = [
                self._handle_plurals_tag(tag, offset, pending_comment)
            ]
        else:
            continue

        # A comment only applies to the first tag after it
        pending_comment = ""
        extracted.extend(item for item in produced if item is not None)

    self.transcriber.copy_until(len(source))
    template = prologue + self.transcriber.get_destination()
    self.transcriber = None

    return template, extracted
def compile(self, template, stringset, **kwargs):
    """Replace every hash in `template` with the matching string.

    Non-pluralized strings replace their hash directly. For pluralized
    strings one ``plural_template`` entry is written per plural form; when
    the hash sits alone on its line (only whitespace around it) every entry
    is written on its own line with the same indent and tail.

    Raises:
        ValueError: if a hash is missing from the template, or a pluralized
            hash has no newline before or after it.
    """
    # Fix regex encoding
    blank_pattern = re.compile(ensure_unicode(self.SPACE_PAT))

    # assume stringset is ordered within the template
    transcriber = Transcriber(template)
    template = transcriber.source

    for string in stringset:
        placeholder = string.template_replacement
        hash_start = template.index(placeholder)

        if not string.pluralized:
            transcriber.copy_until(hash_start)
            transcriber.add(string.string)
            transcriber.skip(len(placeholder))
            continue

        # Locate the line the hash lives on, so its indent and tail can be
        # reproduced for every plural form
        hash_end = hash_start + len(placeholder)
        line_start = template.rindex('\n', 0, hash_start + 1) + 1
        line_end = template.index('\n', hash_end)
        indent = template[line_start:hash_start]
        tail = template[hash_end:line_end]

        if blank_pattern.search(indent) and blank_pattern.search(tail):
            # hash is on its own line: rewrite the whole line per form
            transcriber.copy_until(line_start)
            prefix, suffix = indent, tail + '\n'
            consumed = line_end + 1 - line_start
        else:
            # string is not on its own, simply replace hash with all
            # plural forms
            transcriber.copy_until(hash_start)
            prefix, suffix = '', ''
            consumed = len(placeholder)

        for rule, value in six.iteritems(string.string):
            transcriber.add(
                prefix +
                self.plural_template.format(rule=self.RULES_ITOA[rule],
                                            string=value) +
                suffix
            )
        transcriber.skip(consumed)

    transcriber.copy_until(len(template))
    return transcriber.get_destination()
def compile(self, template, stringset, **kwargs):
    """Compile `template` by replacing section hashes with their strings.

    The stringset is expected to be ordered like the sections of the
    template. Each section whose third line starts with the hash of the
    current string gets that hash replaced; every other section is removed
    from the output.

    An empty stringset is accepted (every section is then removed), and a
    string is used for at most one section.

    Raises:
        ValueError: if a section has fewer than two newline characters.
    """
    transcriber = Transcriber(template)
    template = transcriber.source

    remaining = iter(stringset)
    # ``None`` means "no string left to place"; using a default avoids
    # leaking StopIteration to the caller when the stringset is empty
    string = next(remaining, None)

    for start, subtitle_section in self._generate_split_subtitles(template):
        transcriber.copy_until(start)
        transcriber.mark_section_start()

        # Hash is supposed to follow second newline character
        first_newline = subtitle_section.index('\n')
        second_newline = subtitle_section.index('\n', first_newline + 1)
        hash_position = second_newline + 1

        found = (
            string is not None and
            subtitle_section.startswith(string.template_replacement,
                                        hash_position)
        )
        if found:
            transcriber.copy_until(start + hash_position)
            transcriber.add(string.string)
            transcriber.skip(len(string.template_replacement))
            transcriber.copy_until(start + len(subtitle_section))
            transcriber.mark_section_end()
            string = next(remaining, None)
        else:
            # did not find it, must remove section
            transcriber.copy_until(start + len(subtitle_section))
            transcriber.mark_section_end()
            transcriber.remove_section()

    transcriber.copy_until(len(template))
    return transcriber.get_destination()
class BetaAndroidHandler(Handler):
    """Handler for Android-style ``<resources>`` XML string files.

    ``parse`` replaces the translatable content of ``<string>``,
    ``<string-array>`` and ``<plurals>`` tags with hashes and returns the
    extracted strings; ``compile`` puts strings back in place of the hashes
    and removes entries for which no string is available.
    """

    name = "BETA_ANDROID"
    extension = "xml"

    # Format of a single plural form written inside <plurals>
    plural_template = u'<item quantity="{rule}">{string}</item>'
    # Matches text that is empty or consists of whitespace only
    SPACE_PAT = re.compile(r'^\s*$')
    # Attributes that designate a string should be filtered out
    FILTER_ATTRIBUTES = {
        'translatable': 'false'
    }

    EXTRACTS_RAW = False

    # NOTE(review): the `hh\|h\|l\|ll` part uses escaped pipes, so it matches
    # the literal text "hh|h|l|ll" rather than alternatives. This looks
    # unintended; confirm against users of SPECIFIER before changing it.
    SPECIFIER = re.compile(
        r'%((?:(?P<ord>\d+)\$|\((?P<key>\w+)\))?(?P<fullvar>[+#\- 0]*(?:\d+)?'
        r'(?:\.\d+)?(hh\|h\|l\|ll|j|z|t|L)?(?P<type>[diufFeEgGxXaAoscpn%])))'
    )

    def parse(self, content, **kwargs):
        """Extract strings from the ``<resources`` part of `content`.

        Binary content is decoded as UTF-8. Text before ``<resources`` is
        carried over to the template untouched.

        Returns:
            A ``(template, stringset)`` tuple.
        """
        stringset = []
        if isinstance(content, six.binary_type):
            content = content.decode("utf-8")  # convert to unicode

        resources_tag_position = content.index("<resources")

        self.transcriber = Transcriber(content[resources_tag_position:])
        source = self.transcriber.source

        self._order = 0

        resources_tag = DumbXml(source)
        last_comment = ""
        for tag, offset in resources_tag.find(
                ("string-array", "string", "plurals", DumbXml.COMMENT)):
            if self._should_ignore(tag):
                last_comment = ""
                continue
            if tag.name == DumbXml.COMMENT:
                # Remember the comment for the tag that follows it
                last_comment = tag.inner
                self.transcriber.copy_until(offset + len(tag.content))
            elif tag.name == "string":
                string = self._handle_string_tag(tag, offset, last_comment)
                last_comment = ""
                if string is not None:
                    stringset.append(string)
            elif tag.name == "string-array":
                for string in self._handle_string_array_tag(tag, offset,
                                                            last_comment):
                    if string is not None:
                        stringset.append(string)
                last_comment = ""
            elif tag.name == "plurals":
                string = self._handle_plurals_tag(tag, offset, last_comment)
                if string is not None:
                    stringset.append(string)
                last_comment = ""

        self.transcriber.copy_until(len(source))

        template = content[:resources_tag_position] +\
            self.transcriber.get_destination()

        self.transcriber = None

        return template, stringset

    def _handle_string_tag(self, tag, offset, comment):
        """Process a ``<string>`` tag.

        Returns the extracted string, or None when the tag's content is
        blank (in which case the tag is copied unchanged).
        """
        string = None
        if tag.inner.strip() != "":
            context = tag.attrs.get('product', "")
            string = OpenString(tag.attrs['name'], tag.inner,
                                context=context, order=self._order,
                                developer_comment=comment)
            self._order += 1

        # Copy up to the start of the tag's inner content
        self.transcriber.copy_until(offset + tag.inner_offset)

        # Replace the inner content with the hash, or keep it as it is
        if string is not None:
            self.transcriber.add(string.template_replacement)
            self.transcriber.skip(len(tag.inner))
        else:
            self.transcriber.copy_until(offset + tag.inner_offset +
                                        len(tag.inner))

        # Copy the closing part of the tag
        self.transcriber.copy_until(offset + len(tag.content))

        return string

    def _handle_string_array_tag(self, string_array_tag, string_array_offset,
                                 comment):
        """Process a ``<string-array>`` tag, one ``<item>`` at a time.

        Generator: yields one value per item, either the extracted string
        or None for a blank item. The transcriber work for an item is
        performed when the generator is resumed after the yield, so the
        generator has to be exhausted.
        """
        # Copy up to the start of the array's inner content
        self.transcriber.copy_until(string_array_offset +
                                    string_array_tag.inner_offset)

        context = string_array_tag.attrs.get('product', "")
        for index, (item_tag, item_offset) in enumerate(
                string_array_tag.find('item')):
            string = None
            if item_tag.inner.strip() != "":
                string = OpenString(
                    "{}[{}]".format(string_array_tag.attrs['name'], index),
                    item_tag.inner,
                    context=context,
                    order=self._order,
                    developer_comment=comment
                )
                self._order += 1
            yield string

            item_start = string_array_offset + item_offset

            # Copy up to the start of the item's inner content
            self.transcriber.copy_until(item_start + item_tag.inner_offset)

            # Replace a non-blank item's content with the hash
            if string is not None:
                self.transcriber.add(string.template_replacement)
                self.transcriber.skip(len(item_tag.inner))

            # Copy up to the end of the item and no further. The offsets
            # must not include `inner_offset` here, or the copy would run
            # past the closing </item>.
            self.transcriber.copy_until(item_start + len(item_tag.content))

        # Copy the closing part of the array
        self.transcriber.copy_until(string_array_offset +
                                    len(string_array_tag.content))

    def _handle_plurals_tag(self, plurals_tag, plurals_offset, comment):
        """Process a ``<plurals>`` tag.

        All items, from the start of the first to the end of the last, are
        replaced by a single hash. Returns the pluralized string, or None
        (tag copied unchanged) when any item is blank or there are no
        items at all.
        """
        # Copy up to the start of the tag's inner content
        self.transcriber.copy_until(plurals_offset + plurals_tag.inner_offset)

        first_item_offset = None
        last_item_tag = last_item_offset = None
        strings = {}
        for item_tag, item_offset in plurals_tag.find('item'):
            if item_tag.inner.strip() == "":
                strings = None
                break

            if first_item_offset is None:
                first_item_offset = item_offset

            rule = self.get_rule_number(item_tag.attrs['quantity'])
            strings[rule] = item_tag.inner
            last_item_tag, last_item_offset = item_tag, item_offset

        # `strings` is None for a blank item and empty when the tag has no
        # items; neither case produces a string
        if strings:
            context = plurals_tag.attrs.get('product', "")
            string = OpenString(plurals_tag.attrs['name'], strings,
                                pluralized=True,
                                context=context, order=self._order,
                                developer_comment=comment)
            self._order += 1

            # Copy up to the start of the first item
            self.transcriber.copy_until(plurals_offset + first_item_offset)

            # Write the hash and skip to the end of the last item
            self.transcriber.add(string.template_replacement)
            self.transcriber.skip(last_item_offset +
                                  len(last_item_tag.content) -
                                  first_item_offset)
        else:
            string = None

        # Copy the closing part of the tag
        self.transcriber.copy_until(plurals_offset + len(plurals_tag.content))

        return string

    def _should_ignore(self, tag):
        """
        If the tag has a key: value element that matches FILTER_ATTRIBUTES
        it will return True, else it returns False
        """
        for key, value in six.iteritems(self.FILTER_ATTRIBUTES):
            filter_attr = tag.attrs.get(key, None)
            if filter_attr is not None and filter_attr == value:
                return True
        return False

    def compile(self, template, stringset, **kwargs):
        """Build the compiled file from `template` and `stringset`.

        A first pass hands every non-ignored tag to its ``_compile_*``
        method; a second pass over that result drops ``string-array`` tags
        that have inner content but no ``item`` children.
        """
        resources_tag_position = template.index("<resources")
        self._stringset = list(stringset)
        self._stringset_index = 0

        self.transcriber = Transcriber(template[resources_tag_position:])
        self.source = self.transcriber.source

        resources_tag = DumbXml(self.source)

        for tag, offset in resources_tag.find(("string", "string-array",
                                               "plurals")):
            if self._should_ignore(tag):
                continue
            if tag.name == "string":
                self._compile_string(tag, offset)
            elif tag.name == "string-array":
                self._compile_string_array(tag, offset)
            elif tag.name == "plurals":
                self._compile_plurals(tag, offset)
        self.transcriber.copy_until(len(self.source))

        # Lets do another pass to clear empty <string-array>s
        self.transcriber = Transcriber(self.transcriber.get_destination())
        self.source = self.transcriber.source
        resources_tag = DumbXml(self.source)
        for string_array_tag, string_array_offset in resources_tag.find(
                "string-array"):
            if (string_array_tag.inner and
                    not list(string_array_tag.find("item"))):
                self.transcriber.copy_until(string_array_offset)
                self.transcriber.skip(len(string_array_tag.content))
        self.transcriber.copy_until(len(self.source))

        compiled = template[:resources_tag_position] +\
            self.transcriber.get_destination()

        self._stringset = None
        self._stringset_index = None
        self.transcriber = None

        return compiled

    def _compile_string(self, string_tag, string_offset):
        """Compile a ``<string>`` tag, or drop it when its hash is not the
        next string in the stringset."""
        try:
            next_string = self._stringset[self._stringset_index]
        except IndexError:
            next_string = None
        if (next_string is not None and
                next_string.template_replacement == string_tag.inner):
            # found one to replace
            self._stringset_index += 1

            self.transcriber.copy_until(string_offset +
                                        string_tag.inner_offset)
            self.transcriber.add(next_string.string)
            self.transcriber.skip(len(string_tag.inner))
            self.transcriber.copy_until(string_offset +
                                        len(string_tag.content))
        else:
            # didn't find it, must remove by skipping it
            self.transcriber.copy_until(string_offset)
            self.transcriber.skip(len(string_tag.content))

    def _compile_string_array(self, string_array_tag, string_array_offset):
        """Compile the items of a ``<string-array>`` tag, dropping every
        item whose hash is not the next string in the stringset."""
        self.transcriber.copy_until(string_array_offset +
                                    string_array_tag.inner_offset)
        for item_tag, item_offset in string_array_tag.find("item"):
            try:
                next_string = self._stringset[self._stringset_index]
            except IndexError:
                next_string = None
            if (next_string is not None and
                    next_string.template_replacement == item_tag.inner):
                # found one to replace
                self._stringset_index += 1

                self.transcriber.copy_until(string_array_offset +
                                            item_offset +
                                            item_tag.inner_offset)
                self.transcriber.add(next_string.string)
                self.transcriber.skip(len(item_tag.inner))
                self.transcriber.copy_until(string_array_offset +
                                            item_offset +
                                            len(item_tag.content))
            else:
                # didn't find it, must remove by skipping it
                self.transcriber.copy_until(string_array_offset + item_offset)
                self.transcriber.skip(len(item_tag.content))
        self.transcriber.copy_until(string_array_offset +
                                    len(string_array_tag.content))

    def _compile_plurals(self, plurals_tag, plurals_offset):
        """Compile a ``<plurals>`` tag, or drop it when its hash is not the
        next string in the stringset.

        When the hash is alone on its line, each plural form is written on
        its own line with the same indent and tail; otherwise the forms
        replace the hash inline.
        """
        try:
            next_string = self._stringset[self._stringset_index]
        except IndexError:
            next_string = None

        if (next_string is None or
                next_string.template_replacement !=
                plurals_tag.inner.strip()):
            # didn't find it, must remove by skipping it
            self.transcriber.copy_until(plurals_offset)
            self.transcriber.skip_until(plurals_offset +
                                        len(plurals_tag.content))
            return

        # found one to replace, if the hash is on its own on a line with
        # only spaces, we have to remember its indent
        self._stringset_index += 1
        replacement = next_string.template_replacement
        hash_position = (plurals_offset + plurals_tag.inner_offset +
                         plurals_tag.inner.index(replacement))

        is_multiline = True
        indent_length = tail_length = 0
        try:
            indent_length = self.source[hash_position::-1].index('\n') - 1
            indent = self.source[hash_position - indent_length:hash_position]
            end_of_hash = hash_position + len(replacement)
            tail_length = self.source[end_of_hash:].index('\n')
            tail = self.source[end_of_hash:end_of_hash + tail_length]
        except ValueError:
            # No newline before or after the hash
            is_multiline = False

        is_multiline = (is_multiline and
                        self.SPACE_PAT.search(indent) and
                        self.SPACE_PAT.search(tail))

        if is_multiline:
            # write until beginning of the hash's line
            self.transcriber.copy_until(hash_position - indent_length)
            for rule, value in six.iteritems(next_string.string):
                self.transcriber.add(
                    indent +
                    self.plural_template.format(
                        rule=self.get_rule_string(rule), string=value
                    ) +
                    tail + '\n'
                )
            self.transcriber.skip(indent_length + len(replacement) +
                                  tail_length + 1)
        else:
            # string is not on its own, simply replace hash with all
            # plural forms
            self.transcriber.copy_until(hash_position)
            for rule, value in six.iteritems(next_string.string):
                self.transcriber.add(
                    self.plural_template.format(
                        rule=self.get_rule_string(rule), string=value
                    )
                )
            # Everything up to the hash has been copied already, so only
            # the hash itself is skipped; also skipping the indent and
            # tail lengths would jump past text following the hash
            self.transcriber.skip(len(replacement))

        # finish up by copying until the end of </plurals>
        self.transcriber.copy_until(plurals_offset +
                                    len(plurals_tag.content))
class SrtHandler(Handler):
    """Handler for SubRip (``.srt``) subtitle files.

    A file is treated as sections separated by blank lines, each made of
    an order number line, a timings line and the subtitle text.
    """

    name = "SRT"
    extension = "srt"
    EXTRACTS_RAW = False

    # Matches the first non-whitespace character of a section
    NON_SPACE_PAT = re.compile(r'[^\s]')

    def _generate_split_subtitles(self, content, **kwargs):
        """Yield ``(start, section)`` for every non-blank section.

        `start` is the position in `content` of the section's first
        non-whitespace character and `section` is the stripped section.
        """
        start = 0
        for section in content.split('\n\n'):
            # find first non-space character of section
            match = self.NON_SPACE_PAT.search(section)
            if match:
                yield start + match.start(), section.strip()
            # account for the section and the separator that followed it
            start += len(section) + 2

    def parse(self, content):
        """Extract the subtitles of `content`.

        Returns:
            A ``(template, stringset)`` tuple.
        Raises:
            ParseError: if a section is malformed.
        """
        self.transcriber = Transcriber(content)
        source = self.transcriber.source
        stringset = []
        self.max_order = None
        for start, subtitle_section in self._generate_split_subtitles(source):
            self.transcriber.copy_until(start)
            offset, string = self._parse_section(start, subtitle_section)

            if string:
                stringset.append(string)

                # Keep order number and timings, swap the text for the hash
                self.transcriber.copy_until(offset)
                self.transcriber.add(string.template_replacement)
                self.transcriber.skip(len(string.string))
            else:
                self.transcriber.copy_until(start + len(subtitle_section))

        self.transcriber.copy_until(len(source))

        template = self.transcriber.get_destination()
        return template, stringset

    def _parse_section(self, offset, section):
        """Validate one section and build its string.

        Returns:
            A tuple of the position where the subtitle text starts
            (relative to the same origin as `offset`) and the string.
        Raises:
            ParseError: on missing lines, an invalid or non-ascending
                order number, malformed timings or an empty subtitle.
        """
        try:
            order_str, timings, string = section.split('\n', 2)
        except ValueError:
            raise ParseError(
                u"Not enough data on subtitle section on line {}. Order "
                u"number, timings and subtitle content are needed".
                format(self.transcriber.line_number)
            )

        # first line, order
        order_parse_error = False
        try:
            order_int = int(order_str.strip())
        except ValueError:
            order_parse_error = True
        else:
            if order_int <= 0:
                order_parse_error = True
        if order_parse_error:
            raise ParseError(
                u"Order number on line {line_no} ({order_no}) must be a "
                u"positive integer".format(
                    line_no=self.transcriber.line_number,
                    order_no=order_str,
                )
            )
        if self.max_order is not None and order_int <= self.max_order:
            raise ParseError(
                u"Order numbers must be in ascending order; number in line "
                u"{line_no} ({order_no}) is wrong".format(
                    line_no=self.transcriber.line_number,
                    order_no=order_int,
                )
            )
        else:
            self.max_order = order_int

        # second line, timings
        timings_parse_error = False
        try:
            splitted = timings.split(None, 3)
            if len(splitted) == 3:
                start, arrow, end = splitted
            else:
                # a fourth part (anything after the end time) is ignored;
                # fewer than three parts fails the unpacking
                start, arrow, end, _ = splitted
        except ValueError:
            timings_parse_error = True
        else:
            if arrow != u"-->":
                timings_parse_error = True
        if timings_parse_error:
            raise ParseError(
                u"Timings on line {} don't follow '[start] --> [end] "
                "(position)' pattern".format(
                    self.transcriber.line_number + 1
                )
            )
        try:
            start = self._format_timing(start)
        except ValueError:
            raise ParseError(
                u"Problem with start of timing at line {line_no}: '{start}'".
                format(line_no=self.transcriber.line_number + 1, start=start)
            )
        try:
            end = self._format_timing(end)
        except ValueError:
            raise ParseError(
                u"Problem with end of timing at line {line_no}: '{end}'".
                format(line_no=self.transcriber.line_number + 1, end=end)
            )

        # Content
        string_stripped = string.strip()
        if string_stripped == u"":
            raise ParseError(u"Subtitle is empty on line {}".
                             format(self.transcriber.line_number + 2))

        string = OpenString(order_str.strip(), string, order=order_int,
                            occurrences="{},{}".format(start, end))
        return offset + len(order_str) + 1 + len(timings) + 1, string

    def _format_timing(self, timing):
        """Normalize a timing to ``HH:MM:SS.mmm``.

        The part after a comma is right-padded with zeros to three
        characters; without a comma the milliseconds are ``000``.

        Raises:
            ValueError: if the timing cannot be split or converted.
        """
        try:
            rest, milliseconds = timing.split(',')
            milliseconds = "{:<03}".format(milliseconds)
        except ValueError:
            rest, milliseconds = timing, "000"
        hours, minutes, seconds = rest.split(':')
        hours, minutes, seconds, milliseconds = (int(hours),
                                                 int(minutes),
                                                 int(seconds),
                                                 int(milliseconds))
        return "{:02}:{:02}:{:02}.{:03}".format(hours, minutes, seconds,
                                                milliseconds)

    def compile(self, template, stringset, **kwargs):
        """Compile `template` by replacing section hashes with strings.

        The stringset is expected to be ordered like the sections of the
        template. Sections whose third line does not start with the hash
        of the current string are removed. An empty stringset is accepted
        (every section is then removed), and a string is used for at most
        one section.

        Raises:
            ValueError: if a section has fewer than two newlines.
        """
        transcriber = Transcriber(template)
        template = transcriber.source

        remaining = iter(stringset)
        # ``None`` means "no string left to place"; using a default avoids
        # leaking StopIteration to the caller when the stringset is empty
        string = next(remaining, None)

        for start, subtitle_section in self.\
                _generate_split_subtitles(template):
            transcriber.copy_until(start)
            transcriber.mark_section_start()

            # Hash is supposed to follow second newline character
            first_newline = subtitle_section.index('\n')
            second_newline = subtitle_section.index('\n', first_newline + 1)
            hash_position = second_newline + 1

            found = (
                string is not None and
                subtitle_section.startswith(string.template_replacement,
                                            hash_position)
            )
            if found:
                transcriber.copy_until(start + hash_position)
                transcriber.add(string.string)
                transcriber.skip(len(string.template_replacement))
                transcriber.copy_until(start + len(subtitle_section))
                transcriber.mark_section_end()
                string = next(remaining, None)
            else:
                # did not find it, must remove section
                transcriber.copy_until(start + len(subtitle_section))
                transcriber.mark_section_end()
                transcriber.remove_section()

        transcriber.copy_until(len(template))
        return transcriber.get_destination()
def _compile_from_template(self, template, stringset, **kwargs):
    """ Compiles translation file from template

    Walks the stringset and, for every string, swaps its template
    replacement in the template for the rendered translation.

    Returns:
        The compiled file content.
    """
    transcriber = Transcriber(template)
    template = transcriber.source

    for entry in stringset:
        render = (self._compile_pluralized if entry.pluralized
                  else self._write_styled_literal)
        rendered = render(entry)

        placeholder = entry.template_replacement
        transcriber.copy_until(template.index(placeholder))

        # The context contains custom tags. If it exists, we must prepend
        # it and apply a space afterwards so it doesn't get merged with the
        # string
        if entry.context:
            transcriber.add(entry.context)
            transcriber.add(' ')

        transcriber.add(rendered)
        transcriber.skip(len(placeholder))

    transcriber.copy_until(len(template))
    return transcriber.get_destination()
class SrtHandler(Handler):
    """Handler for SubRip (``.srt``) subtitle files.

    A file is treated as sections separated by blank lines, each made of
    an order number line, a timings line and the subtitle text.
    """

    name = "SRT"
    extension = "srt"

    # Matches the first non-whitespace character of a section
    NON_SPACE_PAT = re.compile(r'[^\s]')

    def _generate_split_subtitles(self, content, **kwargs):
        """Yield ``(start, section)`` for every non-blank section.

        `start` is the position in `content` of the section's first
        non-whitespace character and `section` is the stripped section.
        """
        start = 0
        for section in content.split('\n\n'):
            # find first non-space character of section
            match = self.NON_SPACE_PAT.search(section)
            if match:
                yield start + match.start(), section.strip()
            # account for the section and the separator that followed it
            start += len(section) + 2

    def parse(self, content):
        """Extract the subtitles of `content`.

        Returns:
            A ``(template, stringset)`` tuple.
        Raises:
            ParseError: if a section is malformed.
        """
        self.transcriber = Transcriber(content)
        source = self.transcriber.source
        stringset = []
        self.max_order = None
        for start, subtitle_section in self._generate_split_subtitles(source):
            self.transcriber.copy_until(start)
            offset, string = self._parse_section(start, subtitle_section)

            if string:
                stringset.append(string)

                # Keep order number and timings, swap the text for the hash
                self.transcriber.copy_until(offset)
                self.transcriber.add(string.template_replacement)
                self.transcriber.skip(len(string.string))
            else:
                self.transcriber.copy_until(start + len(subtitle_section))

        self.transcriber.copy_until(len(source))

        template = self.transcriber.get_destination()
        return template, stringset

    def _parse_section(self, offset, section):
        """Validate one section and build its string.

        Returns:
            A tuple of the position where the subtitle text starts
            (relative to the same origin as `offset`) and the string.
        Raises:
            ParseError: on missing lines, an invalid or non-ascending
                order number, malformed timings or an empty subtitle.
        """
        try:
            order_str, timings, string = section.split('\n', 2)
        except ValueError:
            raise ParseError(
                u"Not enough data on subtitle section on line {}. Order "
                u"number, timings and subtitle content are needed".
                format(self.transcriber.line_number)
            )

        # first line, order
        order_parse_error = False
        try:
            order_int = int(order_str.strip())
        except ValueError:
            order_parse_error = True
        else:
            if order_int <= 0:
                order_parse_error = True
        if order_parse_error:
            raise ParseError(
                u"Order number on line {line_no} ({order_no}) must be a "
                u"positive integer".format(
                    line_no=self.transcriber.line_number,
                    order_no=order_str,
                )
            )
        if self.max_order is not None and order_int <= self.max_order:
            raise ParseError(
                u"Order numbers must be in ascending order; number in line "
                u"{line_no} ({order_no}) is wrong".format(
                    line_no=self.transcriber.line_number,
                    order_no=order_int,
                )
            )
        else:
            self.max_order = order_int

        # second line, timings
        timings_parse_error = False
        try:
            splitted = timings.split(None, 3)
            if len(splitted) == 3:
                start, arrow, end = splitted
            else:
                # a fourth part (anything after the end time) is ignored;
                # fewer than three parts fails the unpacking
                start, arrow, end, _ = splitted
        except ValueError:
            timings_parse_error = True
        else:
            if arrow != u"-->":
                timings_parse_error = True
        if timings_parse_error:
            raise ParseError(
                u"Timings on line {} don't follow '[start] --> [end] "
                "(position)' pattern".format(
                    self.transcriber.line_number + 1
                )
            )
        try:
            start = self._format_timing(start)
        except ValueError:
            raise ParseError(
                u"Problem with start of timing at line {line_no}: '{start}'".
                format(line_no=self.transcriber.line_number + 1, start=start)
            )
        try:
            end = self._format_timing(end)
        except ValueError:
            raise ParseError(
                u"Problem with end of timing at line {line_no}: '{end}'".
                format(line_no=self.transcriber.line_number + 1, end=end)
            )

        # Content
        string_stripped = string.strip()
        if string_stripped == u"":
            raise ParseError(u"Subtitle is empty on line {}".
                             format(self.transcriber.line_number + 2))

        string = OpenString(order_str.strip(), string, order=order_int,
                            occurrences="{},{}".format(start, end))
        return offset + len(order_str) + 1 + len(timings) + 1, string

    def _format_timing(self, timing):
        """Normalize a timing to ``HH:MM:SS.mmm``.

        The part after a comma is right-padded with zeros to three
        characters; without a comma the milliseconds are ``000``.

        Raises:
            ValueError: if the timing cannot be split or converted.
        """
        try:
            rest, milliseconds = timing.split(',')
            milliseconds = "{:<03}".format(milliseconds)
        except ValueError:
            rest, milliseconds = timing, "000"
        hours, minutes, seconds = rest.split(':')
        hours, minutes, seconds, milliseconds = (int(hours),
                                                 int(minutes),
                                                 int(seconds),
                                                 int(milliseconds))
        return "{:02}:{:02}:{:02}.{:03}".format(hours, minutes, seconds,
                                                milliseconds)

    def compile(self, template, stringset, **kwargs):
        """Compile `template` by replacing section hashes with strings.

        The stringset is expected to be ordered like the sections of the
        template. Sections whose third line does not start with the hash
        of the current string are removed. An empty stringset is accepted
        (every section is then removed), and a string is used for at most
        one section.

        Raises:
            ValueError: if a section has fewer than two newlines.
        """
        transcriber = Transcriber(template)
        template = transcriber.source

        remaining = iter(stringset)
        # ``None`` means "no string left to place"; using a default avoids
        # leaking StopIteration to the caller when the stringset is empty
        string = next(remaining, None)

        for start, subtitle_section in self.\
                _generate_split_subtitles(template):
            transcriber.copy_until(start)
            transcriber.mark_section_start()

            # Hash is supposed to follow second newline character
            first_newline = subtitle_section.index('\n')
            second_newline = subtitle_section.index('\n', first_newline + 1)
            hash_position = second_newline + 1

            found = (
                string is not None and
                subtitle_section.startswith(string.template_replacement,
                                            hash_position)
            )
            if found:
                transcriber.copy_until(start + hash_position)
                transcriber.add(string.string)
                transcriber.skip(len(string.template_replacement))
                transcriber.copy_until(start + len(subtitle_section))
                transcriber.mark_section_end()
                string = next(remaining, None)
            else:
                # did not find it, must remove section
                transcriber.copy_until(start + len(subtitle_section))
                transcriber.mark_section_end()
                transcriber.remove_section()

        transcriber.copy_until(len(template))
        return transcriber.get_destination()
def _escape_lt(string): """Escape `<` character (<). If a valid XML escape sequence is found, it is left as it is. Otherwise, any occurrences of `<` are replaced with `<`. E.g.: "hello world" -> "hello world" "hello <world" -> "hello <world" "hello <world" -> "hello <world" """ # Find "lonely" `<` positions by finding all `<` positions # and subtracting the positions of `<` that are part of # valid XML escape sequences (based on # https://mayart.de/download/Indesign-IDML/special-idml-chars.pdf) all_lt_positions = { match.span()[0] for match in re.finditer(r'<', string) } escaped_lt_positions = { match.span()[0] for match in re.finditer( r'<(\?ACE 18\?|\?ACE 19\?|\?ACE 3\?|\?ACE 8\?|\?ACE 7\?|Br\/)>', string) } target_positions = sorted(all_lt_positions - escaped_lt_positions) # Use Transcriber to replace lonely ampersands with '&' transcriber = Transcriber(string) for position in target_positions: transcriber.copy_until(position) transcriber.add('<') transcriber.skip(1) transcriber.copy_to_end() return transcriber.get_destination()
def _compile_story(self, story_content):
    """Compile a single story by swapping hashes for their strings.

    Strings are taken from the front of ``self.stringset`` for as long as
    the hash of the first remaining string can be located in the story. A
    string whose hash is not found stays at the head of the list. Each
    string is passed through ``self._escape_amps`` before it is written.

    args:
        story_content: the xml content of the story
    returns:
        compiled_story: the compiled story content
    """
    leftover_hash_pattern = re.compile(ensure_unicode(r'[a-z,0-9]{32}_tr'))
    transcriber = Transcriber(story_content)

    while True:
        try:
            candidate = self.stringset[0]
        except IndexError:
            # Nothing left to place
            break
        try:
            position = story_content.index(candidate.template_replacement)
        except ValueError:
            # Hash is not in this story; keep the string at the head of
            # the list and stop
            break
        self.stringset.pop(0)
        transcriber.copy_until(position)
        transcriber.add(self._escape_amps(candidate.string))
        transcriber.skip(len(candidate.template_replacement))

    # Copy whatever follows the last replaced hash
    transcriber.copy_until(len(story_content))

    # Any hash that was not replaced is substituted with an empty string
    return leftover_hash_pattern.sub(u'', transcriber.get_destination())
def compile(self, template, stringset, **kwargs):
    """Fill a template with the given strings and return the result.

    Each string's hash (``template_replacement``) is looked up in the
    template. A non-pluralized string replaces its hash directly. A
    pluralized string is expanded into one ``self.plural_template`` entry
    per plural rule:

    * when the hash sits alone on its line (only whitespace around it),
      the whole line is replaced by one line per rule, each keeping the
      original indent and tail;
    * otherwise, including when the hash is on the first or last line of
      the template and there is no surrounding newline to measure
      against, the entries are written inline in place of the hash.

    args:
        template: the template text containing the hashes
        stringset: the strings to insert; assumed to be ordered the same
            way their hashes appear in the template
        **kwargs: accepted for interface compatibility; not used here
    returns:
        compiled: the compiled content
    raises:
        ValueError: if a string's hash cannot be found in the template
    """
    # Fix regex encoding
    space_pattern = re.compile(ensure_unicode(self.SPACE_PAT))

    # assume stringset is ordered within the template
    transcriber = Transcriber(template)
    template = transcriber.source

    for string in stringset:
        replacement = string.template_replacement
        hash_position = template.index(replacement)
        hash_end = hash_position + len(replacement)

        if not string.pluralized:
            transcriber.copy_until(hash_position)
            transcriber.add(string.string)
            transcriber.skip(len(replacement))
            continue

        # Locate the newlines that delimit the hash's line. Either may be
        # missing (-1) when the hash is on the first or last line.
        newline_before = template.rfind('\n', 0, hash_position)
        newline_after = template.find('\n', hash_end)

        on_own_line = False
        if newline_before != -1 and newline_after != -1:
            # if the hash is on its own on a line with only spaces, we
            # have to remember its indent
            line_start = newline_before + 1
            indent = template[line_start:hash_position]
            tail = template[hash_end:newline_after]
            on_own_line = bool(space_pattern.search(indent) and
                               space_pattern.search(tail))

        if on_own_line:
            transcriber.copy_until(line_start)
            for rule, value in six.iteritems(string.string):
                transcriber.add(
                    indent +
                    self.plural_template.format(
                        rule=self.RULES_ITOA[rule], string=value) +
                    tail + '\n')
            # indent + hash + tail + the trailing newline
            transcriber.skip(newline_after + 1 - line_start)
        else:
            # string is not on its own, simply replace hash with all
            # plural forms
            transcriber.copy_until(hash_position)
            for rule, value in six.iteritems(string.string):
                transcriber.add(
                    self.plural_template.format(
                        rule=self.RULES_ITOA[rule], string=value))
            transcriber.skip(len(replacement))

    transcriber.copy_until(len(template))
    compiled = transcriber.get_destination()

    return compiled
def _escape_amps(string): """ Escape "lonely" `&` (ampersands). If a valid XML escape sequence is found, it is left as it is. Otherwise, any occurrences of `&` are replaced with `&`. Eg, "hello world" -> "hello world" "hello &world" -> "hello &world" "hello &world" -> "hello &world" "hello <world" -> "hello <world" "hello ਟworld" -> "hello ਟworld" "&֯&&" -> "&֯&&" """ # Find "lonely" ampersand positions by finding all ampersand positions # and subtracting the positions of ampersands that are part of valid # XML escape sequences all_amp_positions = { match.span()[0] for match in re.finditer(r'&', string) } escaped_amp_positions = { match.span()[0] for match in re.finditer( r'&(lt|gt|amp|apos|quot|#\d+|#x[0-9a-fA-F]+);', string) } target_positions = sorted(all_amp_positions - escaped_amp_positions) # Use Transcriber to replace lonely ampersands with '&' transcriber = Transcriber(string) for position in target_positions: transcriber.copy_until(position) transcriber.add('&') transcriber.skip(1) transcriber.copy_to_end() return transcriber.get_destination()
class BetaAndroidHandler(Handler):
    """Handler that parses and compiles Android-style XML resources.

    ``parse`` walks the ``<string>``, ``<string-array>`` and ``<plurals>``
    tags found from ``<resources`` onwards, extracts their text into
    ``OpenString`` objects and builds a template in which each extracted
    text is replaced by the string's ``template_replacement`` hash.

    ``compile`` does the reverse: it walks the same tags in a template,
    puts the provided strings back in place of their hashes and removes
    the tags for which no string was provided.

    ``get_rule_number`` and ``get_rule_string`` are used but not defined
    here; presumably they are inherited from ``Handler``.
    """

    name = "BETA_ANDROID"
    extension = "xml"

    # Markup written for every plural form when compiling
    plural_template = u'<item quantity="{rule}">{string}</item>'

    # Matches text that is empty or whitespace-only
    SPACE_PAT = re.compile(r'^\s*$')
    # Attributes that designate a string should be filtered out
    FILTER_ATTRIBUTES = {'translatable': 'false'}

    EXTRACTS_RAW = False

    # NOTE(review): the length-modifier group escapes its first pipes
    # (`hh\|h\|l\|ll`), so it matches that literal text rather than any
    # one of hh/h/l/ll. Left untouched because the users of SPECIFIER are
    # not visible here; confirm whether this is intended.
    SPECIFIER = re.compile(
        r'%((?:(?P<ord>\d+)\$|\((?P<key>\w+)\))?(?P<fullvar>[+#\- 0]*(?:\d+)?'
        r'(?:\.\d+)?(hh\|h\|l\|ll|j|z|t|L)?(?P<type>[diufFeEgGxXaAoscpn%])))')

    def parse(self, content, **kwargs):
        """Extract the strings of an XML resource file.

        args:
            content: the file content; bytes are decoded as UTF-8
            **kwargs: accepted for interface compatibility; not used here
        returns:
            a ``(template, stringset)`` tuple: the content with every
            extracted text replaced by its hash, and the list of
            extracted ``OpenString`` objects in document order
        raises:
            ValueError: if the content has no ``<resources`` tag
        """
        stringset = []
        if isinstance(content, six.binary_type):
            content = content.decode("utf-8")  # convert to unicode

        resources_tag_position = content.index("<resources")

        # Only the part from <resources onwards is transcribed; whatever
        # precedes it is prepended verbatim at the end.
        self.transcriber = Transcriber(content[resources_tag_position:])
        source = self.transcriber.source

        self._order = 0

        resources_tag = DumbXml(source)
        # The most recent comment becomes the developer comment of the
        # next extracted tag and is then forgotten.
        last_comment = ""
        for tag, offset in resources_tag.find(
                ("string-array", "string", "plurals", DumbXml.COMMENT)):
            if self._should_ignore(tag):
                last_comment = ""
                continue
            if tag.name == DumbXml.COMMENT:
                last_comment = tag.inner
                self.transcriber.copy_until(offset + len(tag.content))
            elif tag.name == "string":
                string = self._handle_string_tag(tag, offset, last_comment)
                last_comment = ""
                if string is not None:
                    stringset.append(string)
            elif tag.name == "string-array":
                for string in self._handle_string_array_tag(
                        tag, offset, last_comment):
                    if string is not None:
                        stringset.append(string)
                last_comment = ""
            elif tag.name == "plurals":
                string = self._handle_plurals_tag(tag, offset, last_comment)
                if string is not None:
                    stringset.append(string)
                last_comment = ""

        self.transcriber.copy_until(len(source))
        template = content[:resources_tag_position] +\
            self.transcriber.get_destination()

        self.transcriber = None

        return template, stringset

    def _handle_string_tag(self, tag, offset, comment):
        """Extract a ``<string>`` tag and write it to the template.

        returns:
            the extracted ``OpenString``, or None when the tag's text is
            blank (the tag is then copied to the template unchanged)
        """
        string = None
        if tag.inner.strip() != "":
            context = tag.attrs.get('product', "")
            string = OpenString(tag.attrs['name'], tag.inner,
                                context=context, order=self._order,
                                developer_comment=comment)
            self._order += 1

        # ... <string name="foo">Hello ....
        #                        ^
        self.transcriber.copy_until(offset + tag.inner_offset)

        # ... ing name="foo">Hello world</stri...
        #                               ^
        if string is not None:
            self.transcriber.add(string.template_replacement)
            self.transcriber.skip(len(tag.inner))
        else:
            self.transcriber.copy_until(offset + tag.inner_offset +
                                        len(tag.inner))

        # ...ello World</string>
        #                       ^
        self.transcriber.copy_until(offset + len(tag.content))

        return string

    def _handle_string_array_tag(self, string_array_tag, string_array_offset,
                                 comment):
        """Extract the items of a ``<string-array>`` tag.

        This is a generator: for every ``<item>`` it yields an
        ``OpenString`` keyed ``name[index]``, or None when the item's text
        is blank, and writes the item to the template as it goes.
        """
        # ...ing-array>   <item>H...
        #              ^
        self.transcriber.copy_until(string_array_offset +
                                    string_array_tag.inner_offset)

        context = string_array_tag.attrs.get('product', "")
        for index, (item_tag, item_offset) in enumerate(
                string_array_tag.find('item')):
            string = None
            if item_tag.inner.strip() != "":
                string = OpenString(
                    "{}[{}]".format(string_array_tag.attrs['name'], index),
                    item_tag.inner,
                    context=context,
                    order=self._order,
                    developer_comment=comment)
                self._order += 1

            yield string

            # Item offsets are relative to the start of the <string-array>
            item_start = string_array_offset + item_offset
            inner_start = item_start + item_tag.inner_offset

            # ...   <item>Hello...
            #             ^
            self.transcriber.copy_until(inner_start)

            # ...ello world</item>...
            #              ^
            if string is not None:
                self.transcriber.add(string.template_replacement)
                self.transcriber.skip(len(item_tag.inner))
            else:
                self.transcriber.copy_until(inner_start +
                                            len(item_tag.inner))

            # orld</item>   <it...
            #            ^
            # The end of the item is measured from the item's start, not
            # from the start of its inner text.
            self.transcriber.copy_until(item_start + len(item_tag.content))

        # </item>  </string-array>
        #                         ^
        self.transcriber.copy_until(string_array_offset +
                                    len(string_array_tag.content))

    def _handle_plurals_tag(self, plurals_tag, plurals_offset, comment):
        """Extract a ``<plurals>`` tag as a single pluralized string.

        returns:
            the extracted pluralized ``OpenString``, or None when the tag
            has no ``<item>`` or any of its items is blank (the tag is
            then copied to the template unchanged)
        """
        # <plurals name="foo">   <item>Hello ...
        #                     ^
        self.transcriber.copy_until(plurals_offset + plurals_tag.inner_offset)

        first_item_offset = None
        last_item_tag = last_item_offset = None
        strings = {}
        for item_tag, item_offset in plurals_tag.find('item'):
            if item_tag.inner.strip() == "":
                # A single blank form disqualifies the whole plural
                strings = None
                break

            if first_item_offset is None:
                first_item_offset = item_offset

            rule = self.get_rule_number(item_tag.attrs['quantity'])
            strings[rule] = item_tag.inner
            last_item_tag, last_item_offset = item_tag, item_offset

        # `strings` is None for a blank item and empty when the tag holds
        # no <item> at all; neither yields a string.
        if strings:
            context = plurals_tag.attrs.get('product', "")
            string = OpenString(plurals_tag.attrs['name'], strings,
                                pluralized=True,
                                context=context, order=self._order,
                                developer_comment=comment)
            self._order += 1

            # <plurals name="foo">   <item>Hello ...
            #                        ^
            self.transcriber.copy_until(plurals_offset + first_item_offset)

            # ...</item>   </plurals>...
            #           ^
            # All items, from the first one's start to the last one's end,
            # are replaced by a single hash.
            self.transcriber.add(string.template_replacement)
            self.transcriber.skip(last_item_offset +
                                  len(last_item_tag.content) -
                                  first_item_offset)
        else:
            string = None

        # ...</plurals> ...
        #              ^
        self.transcriber.copy_until(plurals_offset + len(plurals_tag.content))

        return string

    def _should_ignore(self, tag):
        """
        If the tag has a key: value element that matches FILTER_ATTRIBUTES
        it will return True, else it returns False
        """
        for key, value in six.iteritems(self.FILTER_ATTRIBUTES):
            filter_attr = tag.attrs.get(key, None)
            if filter_attr is not None and filter_attr == value:
                return True
        return False

    def compile(self, template, stringset, **kwargs):
        """Put the given strings back into a template.

        Tags whose hash matches the next pending string get that string's
        text; tags with no matching string are removed. A second pass
        removes every ``<string-array>`` left without any ``<item>``.

        args:
            template: the template produced by ``parse``
            stringset: the strings to insert, in template order
            **kwargs: accepted for interface compatibility; not used here
        returns:
            compiled: the compiled content
        raises:
            ValueError: if the template has no ``<resources`` tag
        """
        resources_tag_position = template.index("<resources")

        self._stringset = list(stringset)
        self._stringset_index = 0

        self.transcriber = Transcriber(template[resources_tag_position:])
        self.source = self.transcriber.source

        resources_tag = DumbXml(self.source)

        for tag, offset in resources_tag.find(
                ("string", "string-array", "plurals")):
            if self._should_ignore(tag):
                continue
            if tag.name == "string":
                self._compile_string(tag, offset)
            elif tag.name == "string-array":
                self._compile_string_array(tag, offset)
            elif tag.name == "plurals":
                self._compile_plurals(tag, offset)
        self.transcriber.copy_until(len(self.source))

        # Lets do another pass to clear empty <string-array>s
        self.transcriber = Transcriber(self.transcriber.get_destination())
        self.source = self.transcriber.source
        resources_tag = DumbXml(self.source)
        for string_array_tag, string_array_offset in resources_tag.find(
                "string-array"):
            if (string_array_tag.inner and
                    not list(string_array_tag.find("item"))):
                self.transcriber.copy_until(string_array_offset)
                self.transcriber.skip(len(string_array_tag.content))
        self.transcriber.copy_until(len(self.source))

        compiled = template[:resources_tag_position] +\
            self.transcriber.get_destination()

        self._stringset = None
        self._stringset_index = None
        self.transcriber = None

        return compiled

    def _next_pending_string(self):
        """Return the next string not yet placed, or None if none is left.

        The string is only peeked at; callers advance
        ``self._stringset_index`` once they have actually used it.
        """
        try:
            return self._stringset[self._stringset_index]
        except IndexError:
            return None

    def _compile_string(self, string_tag, string_offset):
        """Fill in a ``<string>`` tag, or remove it if it has no string."""
        next_string = self._next_pending_string()
        if (next_string is not None and
                next_string.template_replacement == string_tag.inner):
            # found one to replace
            self._stringset_index += 1

            self.transcriber.copy_until(string_offset +
                                        string_tag.inner_offset)
            self.transcriber.add(next_string.string)
            self.transcriber.skip(len(string_tag.inner))
            self.transcriber.copy_until(string_offset +
                                        len(string_tag.content))
        else:
            # didn't find it, must remove by skipping it
            self.transcriber.copy_until(string_offset)
            self.transcriber.skip(len(string_tag.content))

    def _compile_string_array(self, string_array_tag, string_array_offset):
        """Fill in the items of a ``<string-array>``, dropping unmatched ones.

        The array tag itself is always kept here; ``compile`` removes it
        afterwards if it ends up with no items.
        """
        self.transcriber.copy_until(string_array_offset +
                                    string_array_tag.inner_offset)
        for item_tag, item_offset in string_array_tag.find("item"):
            next_string = self._next_pending_string()
            if (next_string is not None and
                    next_string.template_replacement == item_tag.inner):
                # found one to replace
                self._stringset_index += 1

                self.transcriber.copy_until(string_array_offset +
                                            item_offset +
                                            item_tag.inner_offset)
                self.transcriber.add(next_string.string)
                self.transcriber.skip(len(item_tag.inner))
                self.transcriber.copy_until(string_array_offset +
                                            item_offset +
                                            len(item_tag.content))
            else:
                # didn't find it, must remove by skipping it
                self.transcriber.copy_until(string_array_offset + item_offset)
                self.transcriber.skip(len(item_tag.content))
        self.transcriber.copy_until(string_array_offset +
                                    len(string_array_tag.content))

    def _compile_plurals(self, plurals_tag, plurals_offset):
        """Expand a ``<plurals>`` hash into one ``<item>`` per plural form.

        When the hash sits alone on its line, each form is written on its
        own line with the same indent and tail; otherwise the forms are
        written inline in place of the hash. A tag whose content does not
        match the next pending string is removed.
        """
        next_string = self._next_pending_string()
        if (next_string is None or
                next_string.template_replacement !=
                plurals_tag.inner.strip()):
            # didn't find it, must remove by skipping it
            self.transcriber.copy_until(plurals_offset)
            self.transcriber.skip_until(plurals_offset +
                                        len(plurals_tag.content))
            return

        # found one to replace, if the hash is on its own on a line with
        # only spaces, we have to remember its indent
        self._stringset_index += 1

        replacement = next_string.template_replacement
        hash_position = plurals_offset + plurals_tag.inner_offset +\
            plurals_tag.inner.index(replacement)
        end_of_hash = hash_position + len(replacement)

        is_multiline = False
        try:
            indent_length = self.source[hash_position::-1].index('\n') - 1
            tail_length = self.source[end_of_hash:].index('\n')
        except ValueError:
            # No newline before or after the hash: it cannot be on a line
            # of its own, so it is replaced inline.
            pass
        else:
            indent = self.source[hash_position - indent_length:hash_position]
            tail = self.source[end_of_hash:end_of_hash + tail_length]
            is_multiline = bool(self.SPACE_PAT.search(indent) and
                                self.SPACE_PAT.search(tail))

        if is_multiline:
            # write until beginning of hash
            self.transcriber.copy_until(hash_position - indent_length)
            for rule, value in six.iteritems(next_string.string):
                self.transcriber.add(
                    indent +
                    self.plural_template.format(
                        rule=self.get_rule_string(rule), string=value) +
                    tail + '\n')
            # indent + hash + tail + the trailing newline
            self.transcriber.skip(indent_length + len(replacement) +
                                  tail_length + 1)
        else:
            # string is not on its own, simply replace hash with all plural
            # forms
            self.transcriber.copy_until(hash_position)
            for rule, value in six.iteritems(next_string.string):
                self.transcriber.add(
                    self.plural_template.format(
                        rule=self.get_rule_string(rule), string=value))
            # Only the hash itself was left behind by copy_until above;
            # the text around it must survive.
            self.transcriber.skip(len(replacement))

        # finish up by copying until the end of </plurals>
        self.transcriber.copy_until(plurals_offset +
                                    len(plurals_tag.content))