def test_template_importMessage_updates_file_references(self):
     # A template import is expected to overwrite the filereferences
     # already stored on an existing POTMsgSet.
     potemplate = self.factory.makePOTemplate()
     existing_msgset = self.factory.makePOTMsgSet(potemplate=potemplate)
     references_before = self.factory.getUniqueString()
     references_after = self.factory.getUniqueString()
     existing_msgset.filereferences = references_before

     # The incoming message shares the msgid of the existing POTMsgSet
     # but carries a different set of file references.
     incoming = TranslationMessageData()
     incoming.msgid_singular = existing_msgset.singular_text
     incoming.file_references = references_after

     POTFileImporter(
         FakeImportQueueEntry(potemplate), FakeParser(), DevNullLogger()
     ).importMessage(incoming)

     self.assertEqual(references_after, existing_msgset.filereferences)
    def _makeExportedHeader(self, translation_file):
        """Build the PO header of `translation_file` as an exported message.

        The header's raw content is stored as the singular-form translation
        of an otherwise empty message and the header comment is copied over.
        When the file is a template the message is also flagged 'fuzzy'.

        :param translation_file: Object exposing `header` and `is_template`.
        :return: The result of `exportTranslationMessageData` for the header
            message (described upstream as a unicode string).
        """
        message = TranslationMessageData()
        header = translation_file.header
        message.addTranslation(
            TranslationConstants.SINGULAR_FORM, header.getRawContent())
        message.comment = header.comment
        if translation_file.is_template:
            message.flags.update(['fuzzy'])
        return self.exportTranslationMessageData(message)
Exemple #3
0
 def test_template_importMessage_updates_file_references(self):
     # When a template is imported, the filereferences of a POTMsgSet
     # that already exists are replaced by the imported ones.
     make = self.factory
     pot = make.makePOTemplate()
     msgset = make.makePOTMsgSet(potemplate=pot)
     stale_refs = make.getUniqueString()
     fresh_refs = make.getUniqueString()
     msgset.filereferences = stale_refs

     imported_message = TranslationMessageData()
     imported_message.msgid_singular = msgset.singular_text
     imported_message.file_references = fresh_refs

     entry = FakeImportQueueEntry(pot)
     pot_importer = POTFileImporter(entry, FakeParser(), DevNullLogger())
     pot_importer.importMessage(imported_message)

     # The POTMsgSet must now carry the imported references.
     self.assertEqual(fresh_refs, msgset.filereferences)
 def test_translation_importMessage_does_not_update_file_references(self):
     # Regression check for bug 715854: importing a translation message
     # used to overwrite the filereferences of an existing POTMsgSet.
     # It must leave them untouched.
     pofile = self.factory.makePOFile()
     template = pofile.potemplate
     msgset = self.factory.makePOTMsgSet(potemplate=template)
     kept_references = self.factory.getUniqueString()
     incoming_references = self.factory.getUniqueString()
     msgset.filereferences = kept_references

     incoming = TranslationMessageData()
     incoming.msgid_singular = msgset.singular_text
     incoming.file_references = incoming_references

     entry = FakeImportQueueEntry(template, pofile)
     POFileImporter(entry, FakeParser(), DevNullLogger()).importMessage(
         incoming)

     self.assertEqual(kept_references, msgset.filereferences)
    def _makeExportedHeader(self, translation_file):
        """Return the file's gettext PO header, exported as a message.

        :param translation_file: Source of the header; its `header` and
            `is_template` attributes are read.
        :return: The header message as produced by
            `exportTranslationMessageData` (a unicode string according to
            the original documentation).
        """
        header_message = TranslationMessageData()

        # The raw header text travels as the singular translation.
        raw_header = translation_file.header.getRawContent()
        header_message.addTranslation(
            TranslationConstants.SINGULAR_FORM, raw_header)
        header_message.comment = translation_file.header.comment

        # Template headers are marked fuzzy.
        if translation_file.is_template:
            header_message.flags.update(['fuzzy'])

        return self.exportTranslationMessageData(header_message)
Exemple #6
0
 def test_translation_importMessage_does_not_update_file_references(self):
     # A PO file (translation) import must not touch the filereferences
     # of an existing POTMsgSet.  It once did, causing bug 715854.
     make = self.factory
     po = make.makePOFile()
     existing = make.makePOTMsgSet(potemplate=po.potemplate)
     original_refs = make.getUniqueString()
     imported_refs = make.getUniqueString()
     existing.filereferences = original_refs

     data = TranslationMessageData()
     data.msgid_singular = existing.singular_text
     data.file_references = imported_refs

     po_importer = POFileImporter(
         FakeImportQueueEntry(po.potemplate, po),
         FakeParser(),
         DevNullLogger())
     po_importer.importMessage(data)

     # The references set before the import are still in place.
     self.assertEqual(original_refs, existing.filereferences)
Exemple #7
0
 def test_export_message(self):
     # Exporting a single message is not supported by the MO exporter:
     # the call is expected to raise NotImplementedError.
     mo_exporter = GettextMOExporter()
     export_one = mo_exporter.exportTranslationMessageData
     empty_message = TranslationMessageData()
     self.assertRaises(NotImplementedError, export_one, empty_message)
    def _test_storeTranslationsInDatabase_empty(self, by_maintainer=True):
        """Verify that storing an empty message creates no translation.

        :param by_maintainer: Forwarded to the PO file importer only; the
            template importer is always created with by_maintainer=True.
        """
        # The message under test has a single, empty form-0 translation.
        empty_message = TranslationMessageData()
        empty_message.addTranslation(0, u'')

        # Set up the template importer first, then the PO file importer
        # that is built on top of it.
        template_importer = self._createPOTFileImporter(
            TEST_TEMPLATE_EXPORTED, by_maintainer=True)
        po_importer = self._createPOFileImporter(
            template_importer, TEST_TRANSLATION_EXPORTED,
            by_maintainer=by_maintainer, person=self.importer_person)

        target_msgset = self.factory.makePOTMsgSet(
            potemplate=po_importer.potemplate, sequence=50)
        stored = po_importer.storeTranslationsInDatabase(
            empty_message, target_msgset)

        # Storing must not have produced a TranslationMessage.
        self.assertIs(None, stored)
Exemple #9
0
    def new_general_entity(self, name, value):
        """See `xmldtd.WFCDTD`.

        Records the entity `name` with text `value` as a new message in
        `self.messages`.  Nothing happens while `self.started` is falsy.
        """
        if self.started:
            entity = TranslationMessageData()
            entity.msgid_singular = name
            entity.context = self.chrome_path
            entity.source_comment = self.last_comment
            # CarlosPerelloMarin 20070326: the xmldtd parser works inline,
            # so all content is on one line and no line number is available
            # for the source reference.
            entity.file_references_list = [
                "%s(%s)" % (self.filename, name)]
            entity.addTranslation(TranslationConstants.SINGULAR_FORM, value)
            entity.singular_text = value
            self.messages.append(entity)
            # NOTE(review): `started` is only tested for truthiness in this
            # method yet is incremented per entity -- confirm this is
            # intended.
            self.started += 1
            # The pending comment has been consumed by this entity.
            self.last_comment = None
 def getTranslationMessageData(self, translationmessage):
     # Build the TranslationMessageData (the import-time representation)
     # equivalent to the given TranslationMessage.
     source = translationmessage.potmsgset
     data = TranslationMessageData()
     data.context = source.context
     data.msgid_singular = source.singular_text
     data.msgid_plural = source.plural_text
     # Copy the translations over, one per plural form, in order.
     form = 0
     for text in translationmessage.translations:
         data.addTranslation(form, text)
         form += 1
     return data
    def new_general_entity(self, name, value):
        """See `xmldtd.WFCDTD`.

        Appends a message for the entity to `self.messages`, unless
        `self.started` is falsy, in which case the entity is ignored.
        """
        if not self.started:
            return

        # CarlosPerelloMarin 20070326: xmldtd parses inline, so the whole
        # content sits on a single line and the source reference cannot
        # include a line number.
        source_reference = "%s(%s)" % (self.filename, name)

        new_message = TranslationMessageData()
        new_message.msgid_singular = name
        new_message.file_references_list = [source_reference]
        new_message.addTranslation(TranslationConstants.SINGULAR_FORM, value)
        new_message.singular_text = value
        new_message.context = self.chrome_path
        new_message.source_comment = self.last_comment

        self.messages.append(new_message)
        self.started += 1
        # The comment now belongs to the message that was just stored.
        self.last_comment = None
 def test_duplicateTranslationError(self):
     # Providing multiple translations for the same form raises a
     # sensible error message.
     data = TranslationMessageData()
     data.addTranslation(0, 'singular')
     try:
         data.addTranslation(0, 'ralugnis')
     except TranslationFormatSyntaxError as error:
         self.assertEqual(
             error.represent("(Default text, should not be returned.)"),
             "Message has more than one translation for plural form 0.")
     else:
         # Without this branch the test would pass vacuously whenever
         # the duplicate translation is silently accepted.
         self.fail(
             "Adding a second translation for plural form 0 did not "
             "raise TranslationFormatSyntaxError.")
 def getTranslationMessageData(self, translationmessage):
     # Turn a TranslationMessage into the TranslationMessageData object
     # that the import code works with.
     result = TranslationMessageData()

     # Identifying fields come from the message's POTMsgSet.
     msgset = translationmessage.potmsgset
     result.context = msgset.context
     result.msgid_singular = msgset.singular_text
     result.msgid_plural = msgset.plural_text

     # Each translation is registered under its position as plural form.
     for form_number, form_text in enumerate(
             translationmessage.translations):
         result.addTranslation(form_number, form_text)
     return result
    def test_comments_text_representation_multiline(self):
        # Comments containing newlines are exported with every line
        # carrying its own comment prefix.
        expected_comment = "#Line One\n#Line Two"
        expected_source_comment = "#. Line One\n#. Line Two"

        message = TranslationMessageData()
        message.comment = "Line One\nLine Two"
        exported = comments_text_representation(message)
        self.assertEqual(expected_comment, exported)

        # A trailing newline makes no difference to the result.
        message.comment = "Line One\nLine Two\n"
        exported = comments_text_representation(message)
        self.assertEqual(expected_comment, exported)

        # Source comments go through the same processing.
        message = TranslationMessageData()
        message.source_comment = "Line One\nLine Two"
        exported = comments_text_representation(message)
        self.assertEqual(expected_source_comment, exported)

        # Again, a trailing newline makes no difference.
        message.source_comment = "Line One\nLine Two\n"
        exported = comments_text_representation(message)
        self.assertEqual(expected_source_comment, exported)
    def test_comments_text_representation_multiline(self):
        # Both translator comments and source comments that span several
        # lines must be exported line by line, whether or not the text
        # ends in a newline.
        cases = (
            ("comment", "#Line One\n#Line Two"),
            ("source_comment", "#. Line One\n#. Line Two"),
        )
        raw_texts = ("Line One\nLine Two", "Line One\nLine Two\n")

        for attribute, expected in cases:
            # One fresh message per kind of comment; both raw texts are
            # set on that same message in turn.
            data = TranslationMessageData()
            for raw in raw_texts:
                setattr(data, attribute, raw)
                self.assertEqual(expected, comments_text_representation(data))
    def parse(self, content):
        """Parse given content as a property file.

        Once the parse is done, self.messages has a list of the available
        `ITranslationMessageData`s.

        :param content: Raw file content as a byte string.  It is decoded
            as UTF-8 and every line is then unicode-unescaped.
        :raises TranslationFormatInvalidInputError: if `content` is not
            valid UTF-8 or a line cannot be unicode-unescaped.
        :raises TranslationFormatSyntaxError: if a `key=value` line has a
            key that `valid_property_msgid` rejects.
        """

        # .properties files are supposed to be unicode-escaped, but we know
        # that there are some .xpi language packs that instead, use UTF-8.
        # That's against the specification, but Mozilla applications accept
        # it anyway, so we try to support it too.
        # To do this support, we read the text as being in UTF-8
        # because unicode-escaped looks like ASCII files.
        try:
            content = content.decode('utf-8')
        except UnicodeDecodeError:
            raise TranslationFormatInvalidInputError(
                'Content is not valid unicode-escaped text')

        line_num = 0
        is_multi_line_comment = False
        last_comment = None
        last_comment_line_num = 0
        ignore_comment = False
        is_message = False
        translation = u''
        for line in content.splitlines():
            # Count the line before unescaping it, so that an error raised
            # below reports the 1-based number of the offending line, like
            # every other line number produced by this method.
            line_num += 1

            # Now, to "normalize" all to the same encoding, we encode to
            # unicode-escape first, and then decode it to unicode
            # XXX: Danilo 2006-08-01: we _might_ get performance
            # improvements if we reimplement this to work directly,
            # though, it will be hard to beat C-based de/encoder.
            # This call unescapes everything so we don't need to care about
            # quotes escaping.
            try:
                string = line.encode('raw-unicode_escape')
                line = string.decode('unicode_escape')
            except UnicodeDecodeError as exception:
                raise TranslationFormatInvalidInputError(
                    filename=self.filename,
                    line_number=line_num,
                    message=str(exception))

            if not is_multi_line_comment:
                # Remove any white space before the useful data, like
                # ' # foo'.
                line = line.lstrip()
                if len(line) == 0:
                    # It's an empty line. Reset any previous comment we have.
                    last_comment = None
                    last_comment_line_num = 0
                    ignore_comment = False
                elif line.startswith(u'#') or line.startswith(u'//'):
                    # It's a whole line comment.
                    ignore_comment = False
                    # Drop the complete comment marker: one character for
                    # '#', two for '//' (otherwise a stray '/' would end up
                    # in the comment text).
                    marker_length = 2 if line.startswith(u'//') else 1
                    line = line[marker_length:].strip()
                    if last_comment:
                        last_comment += line
                    elif len(line) > 0:
                        last_comment = line

                    if last_comment and not last_comment.endswith('\n'):
                        # Comments must end always with a new line.
                        last_comment += '\n'

                    last_comment_line_num = line_num
                    continue

            # Unescaped URLs are a common mistake: the "//" starts an
            # end-of-line comment.  To work around that, treat "://" as
            # a special case.
            just_saw_colon = False

            while line:
                if is_multi_line_comment:
                    if line.startswith(u'*/'):
                        # The comment ended, we jump the closing tag and
                        # continue with the parsing.
                        line = line[2:]
                        is_multi_line_comment = False
                        last_comment_line_num = line_num
                        if ignore_comment:
                            last_comment = None
                            ignore_comment = False
                        elif last_comment is not None:
                            # Comments must end always with a new line.
                            # (An ignored or empty comment leaves
                            # last_comment as None; there is nothing to
                            # terminate in that case.)
                            last_comment += '\n'
                    elif line.startswith(self.license_block_text):
                        # It's a comment with a licence notice, this
                        # comment can be ignored.
                        ignore_comment = True
                        # Jump the whole tag
                        line = line[len(self.license_block_text):]
                    else:
                        # Store the character.
                        if last_comment is None:
                            last_comment = line[0]
                        elif last_comment_line_num == line_num:
                            last_comment += line[0]
                        else:
                            last_comment = u'%s\n%s' % (last_comment, line[0])
                            last_comment_line_num = line_num
                        # Jump the processed char.
                        line = line[1:]
                    continue
                elif line.startswith(u'/*'):
                    # It's a multi line comment
                    is_multi_line_comment = True
                    ignore_comment = False
                    last_comment_line_num = line_num
                    # Jump the comment starting tag
                    line = line[2:]
                    continue
                elif line.startswith(u'//') and not just_saw_colon:
                    # End-of-line comment.
                    last_comment = '%s\n' % line[2:].strip()
                    last_comment_line_num = line_num
                    # On to next line.
                    break
                elif is_message:
                    # Store the char and continue.
                    head_char = line[0]
                    translation += head_char
                    line = line[1:]
                    just_saw_colon = (head_char == ':')
                    continue
                elif u'=' in line:
                    # Looks like a message string.
                    (key, value) = line.split('=', 1)
                    # Remove leading and trailing white spaces.
                    key = key.strip()

                    if valid_property_msgid(key):
                        is_message = True
                        # Jump the msgid, control chars and leading white
                        # space.
                        line = value.lstrip()
                        continue
                    else:
                        raise TranslationFormatSyntaxError(
                            line_number=line_num,
                            message=u"invalid msgid: '%s'" % key)
                else:
                    # Got a line that is not a valid message nor a valid
                    # comment. Ignore it because main en-US.xpi catalog from
                    # Firefox has such line/error. We follow the 'be strict
                    # with what you export, be permisive with what you import'
                    # policy.
                    break
            if is_message:
                # We just parsed a message, so we need to add it to the list
                # of messages.
                if ignore_comment or last_comment_line_num < line_num - 1:
                    # We must ignore the comment or either the comment is not
                    # the last thing before this message or is not in the same
                    # line as this message.
                    last_comment = None
                    ignore_comment = False

                message = TranslationMessageData()
                message.msgid_singular = key
                message.context = self.chrome_path
                message.file_references_list = [
                    "%s:%d(%s)" % (self.filename, line_num, key)
                ]
                value = translation.strip()
                message.addTranslation(TranslationConstants.SINGULAR_FORM,
                                       value)
                message.singular_text = value
                message.source_comment = last_comment
                self.messages.append(message)

                # Reset status vars.
                last_comment = None
                last_comment_line_num = 0
                is_message = False
                translation = u''
 def test_addTranslation0(self):
     # The common case: a single translation registered for form 0.
     message = TranslationMessageData()
     message.addTranslation(0, 'singular')
     stored = message.translations
     self.assertEqual(stored, ['singular'])
 def test_addTranslation1(self):
     # Rare but allowed: only a higher plural form is translated, which
     # leaves form 0 as None.
     message = TranslationMessageData()
     message.addTranslation(1, 'plural')
     stored = message.translations
     self.assertEqual(stored, [None, 'plural'])
Exemple #19
0
    def _parseFreshLine(self, line, original_line):
        """Parse a new line (not a continuation after escaped newline).

        Comment lines ('#', '#,', '#:', '#.') are recorded on the current
        message and yield None.  Keyword lines (msgctxt, msgid,
        msgid_plural, msgstr, msgstr[N]) switch the current section and
        yield whatever follows the keyword.

        :param line: Remaining part of input line.
        :param original_line: Line as it originally was on input.
        :return: If there is one, the first line of a quoted string belonging
            to the line's section.  Otherwise, None.
        :raises TranslationFormatSyntaxError: on a keyword that is not
            allowed in the current section, on a malformed or unsupported
            plural form index, or on content outside of any section.
        """
        is_obsolete = False
        if line.startswith('#~'):
            if line.startswith('#~|'):
                # This is an old msgid for an obsolete message.
                return None
            else:
                is_obsolete = True
                line = line[2:].lstrip()
                if len(line) == 0:
                    return None

        # If we get a comment line after a msgstr or a line starting with
        # msgid or msgctxt, this is a new entry.
        if ((line.startswith('#') or line.startswith('msgid')
             or line.startswith('msgctxt')) and self._section == 'msgstr'):
            if self._message is None:
                # first entry - do nothing.
                pass
            elif self._message.msgid_singular:
                self._dumpCurrentSection()
                self._storeCurrentMessage()
            elif self._translation_file.header is None:
                # When there is no msgid in the parsed message, it's the
                # header for this file.
                self._dumpCurrentSection()
                self._parseHeader(
                    self._message.translations[
                        TranslationConstants.SINGULAR_FORM],
                    self._message.comment)
            else:
                self._emitSyntaxWarning("We got a second header.")

            # Start a new message.
            self._message = TranslationMessageData()
            self._message_lineno = self._lineno
            self._section = None
            self._plural_case = None
            self._parsed_content = u''

        if self._message is not None:
            # Record whether the message is obsolete.
            self._message.is_obsolete = is_obsolete

        if line[0] == '#':
            # Record flags
            if line[:2] == '#,':
                new_flags = [flag.strip() for flag in line[2:].split(',')]
                self._message.flags.update(new_flags)
                return None
            # Record file references
            if line[:2] == '#:':
                if self._message.file_references:
                    # There is already a file reference, let's split it from
                    # the new one with a new line char.
                    self._message.file_references += '\n'
                self._message.file_references += line[2:].strip()
                return None
            # Record source comments
            if line[:2] == '#.':
                self._message.source_comment += line[2:].strip() + '\n'
                return None
            # Record comments
            self._message.comment += line[1:] + '\n'
            return None

        # Now we are in a msgctxt or msgid section, output previous section
        if line.startswith('msgid_plural'):
            if self._section != 'msgid':
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid_plural")
            self._dumpCurrentSection()
            self._section = 'msgid_plural'
            line = line[len('msgid_plural'):]
        elif line.startswith('msgctxt'):
            if (self._section is not None
                    and (self._section == 'msgctxt'
                         or self._section.startswith('msgid'))):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgctxt")
            self._section = 'msgctxt'
            line = line[len('msgctxt'):]
        elif line.startswith('msgid'):
            if (self._section is not None
                    and self._section.startswith('msgid')):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid")
            if self._section is not None:
                self._dumpCurrentSection()
            self._section = 'msgid'
            line = line[len('msgid'):]
            self._plural_case = None
        # Now we are in a msgstr section
        elif line.startswith('msgstr'):
            self._dumpCurrentSection()
            self._section = 'msgstr'
            line = line[len('msgstr'):]
            # XXX kiko 2005-08-19: if line is empty, it means we got an msgstr
            # followed by a newline; that may be critical, but who knows?
            if line.startswith('['):
                # Plural case
                new_plural_case, closing_bracket, line = (
                    line[1:].partition(']'))
                if not closing_bracket:
                    # "msgstr[" without a closing "]": report it as a
                    # syntax error rather than letting a bare ValueError
                    # from tuple unpacking escape.
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Invalid plural case number.")

                try:
                    new_plural_case = int(new_plural_case)
                except ValueError:
                    # Trigger "invalid plural case number" error.
                    new_plural_case = -1

                if new_plural_case < 0:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Invalid plural case number.")
                elif new_plural_case >= TranslationConstants.MAX_PLURAL_FORMS:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Unsupported plural case number.")

                if (self._plural_case is not None) and (new_plural_case !=
                                                        self._plural_case + 1):
                    self._emitSyntaxWarning("Bad plural case number.")
                if new_plural_case != self._plural_case:
                    self._plural_case = new_plural_case
                else:
                    self._emitSyntaxWarning(
                        "msgstr[] repeats same plural case number.")
            else:
                self._plural_case = TranslationConstants.SINGULAR_FORM
        elif self._section is None:
            raise TranslationFormatSyntaxError(line_number=self._lineno,
                                               message='Invalid content: %r' %
                                               original_line)
        else:
            # This line could be the continuation of a previous section.
            pass

        line = line.strip()
        if len(line) == 0:
            self._emitSyntaxWarning(
                "Line has no content; this is not supported by some "
                "implementations of msgfmt.")
        return line
 def test_duplicateTranslation(self):
     # A second translation for an already translated form is rejected
     # with a TranslationFormatSyntaxError.
     message = TranslationMessageData()
     message.addTranslation(0, 'singular')
     add_again = message.addTranslation
     self.assertRaises(
         TranslationFormatSyntaxError, add_again, 0, 'ralugnis')
    def _parseFreshLine(self, line, original_line):
        """Parse a new line (not a continuation after escaped newline).

        Lines starting with '#' are stored on the current message (flags,
        file references, source comments or plain comments) and produce
        None.  Lines starting with a keyword select the section that the
        following quoted strings belong to.

        :param line: Remaining part of input line.
        :param original_line: Line as it originally was on input.
        :return: If there is one, the first line of a quoted string belonging
            to the line's section.  Otherwise, None.
        :raises TranslationFormatSyntaxError: if a keyword appears where the
            current section does not allow it, if a plural form index is
            malformed or unsupported, or if content shows up outside of any
            section.
        """
        is_obsolete = False
        if line.startswith('#~'):
            if line.startswith('#~|'):
                # This is an old msgid for an obsolete message.
                return None
            else:
                is_obsolete = True
                line = line[2:].lstrip()
                if len(line) == 0:
                    return None

        # If we get a comment line after a msgstr or a line starting with
        # msgid or msgctxt, this is a new entry.
        if ((line.startswith('#') or line.startswith('msgid') or
            line.startswith('msgctxt')) and self._section == 'msgstr'):
            if self._message is None:
                # first entry - do nothing.
                pass
            elif self._message.msgid_singular:
                self._dumpCurrentSection()
                self._storeCurrentMessage()
            elif self._translation_file.header is None:
                # When there is no msgid in the parsed message, it's the
                # header for this file.
                self._dumpCurrentSection()
                self._parseHeader(
                    self._message.translations[
                        TranslationConstants.SINGULAR_FORM],
                    self._message.comment)
            else:
                self._emitSyntaxWarning("We got a second header.")

            # Start a new message.
            self._message = TranslationMessageData()
            self._message_lineno = self._lineno
            self._section = None
            self._plural_case = None
            self._parsed_content = u''

        if self._message is not None:
            # Record whether the message is obsolete.
            self._message.is_obsolete = is_obsolete

        if line[0] == '#':
            # Record flags
            if line[:2] == '#,':
                new_flags = [flag.strip() for flag in line[2:].split(',')]
                self._message.flags.update(new_flags)
                return None
            # Record file references
            if line[:2] == '#:':
                if self._message.file_references:
                    # There is already a file reference, let's split it from
                    # the new one with a new line char.
                    self._message.file_references += '\n'
                self._message.file_references += line[2:].strip()
                return None
            # Record source comments
            if line[:2] == '#.':
                self._message.source_comment += line[2:].strip() + '\n'
                return None
            # Record comments
            self._message.comment += line[1:] + '\n'
            return None

        # Now we are in a msgctxt or msgid section, output previous section
        if line.startswith('msgid_plural'):
            if self._section != 'msgid':
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid_plural")
            self._dumpCurrentSection()
            self._section = 'msgid_plural'
            line = line[len('msgid_plural'):]
        elif line.startswith('msgctxt'):
            if (self._section is not None and
                (self._section == 'msgctxt' or
                 self._section.startswith('msgid'))):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgctxt")
            self._section = 'msgctxt'
            line = line[len('msgctxt'):]
        elif line.startswith('msgid'):
            if (self._section is not None and
                self._section.startswith('msgid')):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid")
            if self._section is not None:
                self._dumpCurrentSection()
            self._section = 'msgid'
            line = line[len('msgid'):]
            self._plural_case = None
        # Now we are in a msgstr section
        elif line.startswith('msgstr'):
            self._dumpCurrentSection()
            self._section = 'msgstr'
            line = line[len('msgstr'):]
            # XXX kiko 2005-08-19: if line is empty, it means we got an msgstr
            # followed by a newline; that may be critical, but who knows?
            if line.startswith('['):
                # Plural case
                if ']' not in line:
                    # A "msgstr[" that is never closed cannot be split into
                    # index and remainder; treat it as a syntax error
                    # instead of failing with a ValueError on unpacking.
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Invalid plural case number.")
                new_plural_case, line = line[1:].split(']', 1)

                try:
                    new_plural_case = int(new_plural_case)
                except ValueError:
                    # Trigger "invalid plural case number" error.
                    new_plural_case = -1

                if new_plural_case < 0:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Invalid plural case number.")
                elif new_plural_case >= TranslationConstants.MAX_PLURAL_FORMS:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Unsupported plural case number.")

                if (self._plural_case is not None) and (
                        new_plural_case != self._plural_case + 1):
                    self._emitSyntaxWarning("Bad plural case number.")
                if new_plural_case != self._plural_case:
                    self._plural_case = new_plural_case
                else:
                    self._emitSyntaxWarning(
                        "msgstr[] repeats same plural case number.")
            else:
                self._plural_case = TranslationConstants.SINGULAR_FORM
        elif self._section is None:
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Invalid content: %r' % original_line)
        else:
            # This line could be the continuation of a previous section.
            pass

        line = line.strip()
        if len(line) == 0:
            self._emitSyntaxWarning(
                "Line has no content; this is not supported by some "
                "implementations of msgfmt.")
        return line
 def test_emptyTranslations(self):
     # A newly constructed TranslationMessageData has no translations yet.
     fresh_message = TranslationMessageData()
     found = fresh_message.translations
     self.assertEqual(found, [])
 def test_addTranslationMulti(self):
     # Adding plural forms 0 and 1 in ascending order stores both, in
     # that order.
     message = TranslationMessageData()
     for form, text in enumerate(['singular', 'plural']):
         message.addTranslation(form, text)
     self.assertEqual(message.translations, ['singular', 'plural'])
 def test_addTranslationReversed(self):
     # Forms added in descending order still end up ordered by form
     # number.
     message = TranslationMessageData()
     for form, text in [(1, 'plural'), (0, 'singular')]:
         message.addTranslation(form, text)
     self.assertEqual(message.translations, ['singular', 'plural'])
    def parse(self, content_text):
        """Parse `content_text` as a PO file and return the parsed file data.

        Lines are first read one at a time and decoded with the charset that
        `parse_charset` reports for the raw content.  Once the header has
        been parsed, the remaining buffered text is split into lines and
        parsed in a single pass.

        :param content_text: The raw PO file content.
        :return: The `TranslationFileData` populated during parsing.
        :raises TranslationFormatSyntaxError: If the file contains no
            messages, has no header, or ends in the middle of a message.
        """
        # Start from a clean slate: file-wide state first...
        self._translation_file = TranslationFileData()
        self._messageids = set()
        self._pending_chars = content_text
        self._pending_unichars = u''
        self._parsed_content = u''
        self._lineno = 0
        # ...then the state of the message currently being read.
        self._message = TranslationMessageData()
        self._message_lineno = self._lineno
        self._section = None
        self._plural_case = None

        # The charset is needed before the header itself can be decoded.
        charset = parse_charset(content_text)

        # Feed the parser one raw line at a time until either the header
        # shows up or a message with a non-empty msgid proves that no header
        # comes first.  This is slow, but the header ought to be short.
        header_line = self._getHeaderLine()
        while header_line is not None:
            self._parseLine(header_line.decode(charset))
            header_found = self._translation_file.header is not None
            if header_found or self._message.msgid_singular:
                break
            header_line = self._getHeaderLine()

        if header_line is None:
            # We ran out of complete lines while still looking for the
            # header.
            header_missing = self._translation_file.header is None
            if header_missing and not self._message.msgid_singular:
                # No real message was seen; flush what has been collected.
                self._dumpCurrentSection()

                # What was collected may still be a header entry.
                if not self._message.translations:
                    raise TranslationFormatSyntaxError(
                        message="File contains no messages.")
                header_text = self._message.translations[
                    TranslationConstants.SINGULAR_FORM]
                self._parseHeader(header_text, self._message.comment)

            # Nothing else is left to parse.
            return self._translation_file

        # Everything that remains is handled in one pass.
        for text_line in re.split(r'\n|\r\n|\r', self._pending_unichars):
            self._parseLine(text_line)

        if self._translation_file.header is None:
            raise TranslationFormatSyntaxError(
                message='No header found in this pofile')

        # Flush the final message, if one is still pending.
        if self._message is None or self._section is None:
            # Either nothing is pending, or the pending message has no
            # content or is just a comment: there is nothing to store.
            return self._translation_file
        if self._section != 'msgstr':
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Got a truncated message!')
        self._dumpCurrentSection()
        self._storeCurrentMessage()
        return self._translation_file
    def _fetchDBRows(self, simulate_timeout=False):
        """Load this POFile's flagged messages into `self.current_messages`.

        Runs one query joining the template's POTMsgSets with their
        TranslationMessages for the POFile's language, then stores one
        `TranslationMessageData` per (msgid, msgid_plural, context) key.
        When several rows share a key, a later row only fills plural forms
        that are still missing or None on the stored message.

        The current transaction is committed before the query runs.  If the
        query hits `TimeoutError`, the transaction is aborted and the method
        returns without touching `self.current_messages`.

        :param simulate_timeout: Used in tests; replaces the real query with
            a sleep under a 1ms statement timeout.
        """
        max_forms = TranslationConstants.MAX_PLURAL_FORMS

        # One join and one output column per supported plural form.
        join_clauses = []
        column_clauses = []
        for form in xrange(max_forms):
            join_clauses.append(
                "LEFT OUTER JOIN POTranslation AS pt%d "
                "ON pt%d.id = TranslationMessage.msgstr%d"
                % (form, form, form))
            column_clauses.append(
                "pt%d.translation AS translation%d" % (form, form))

        substitutions = {
            'translation_columns': ', '.join(column_clauses),
            'translation_joins': '\n'.join(join_clauses),
            'language': quote(self.pofile.language),
            'potemplate': quote(self.pofile.potemplate),
            'flag': self._getFlagName(),
        }

        sql = """
            SELECT
                POMsgId.msgid AS msgid,
                POMsgID_Plural.msgid AS msgid_plural,
                context,
                date_reviewed,
                %(translation_columns)s
            FROM POTMsgSet
            JOIN TranslationTemplateItem ON
                TranslationTemplateItem.potmsgset = POTMsgSet.id AND
                TranslationTemplateItem.potemplate = %(potemplate)s
            JOIN TranslationMessage ON
                POTMsgSet.id=TranslationMessage.potmsgset AND (
                    TranslationMessage.potemplate = %(potemplate)s OR
                    TranslationMessage.potemplate IS NULL) AND
                TranslationMessage.language = %(language)s
            %(translation_joins)s
            JOIN POMsgID ON
                POMsgID.id = POTMsgSet.msgid_singular
            LEFT OUTER JOIN POMsgID AS POMsgID_Plural ON
                POMsgID_Plural.id = POTMsgSet.msgid_plural
            WHERE
                %(flag)s IS TRUE
            ORDER BY
                TranslationTemplateItem.sequence,
                TranslationMessage.potemplate NULLS LAST
          """ % substitutions

        cur = cursor()
        try:
            # XXX JeroenVermeulen 2010-11-24 bug=680802: The timeout is a
            # workaround for bug 408718, but the query has become simpler
            # since.  Check whether it is still needed.

            # Commit the work done so far; otherwise it would be lost if
            # the query hits TimeoutError.
            transaction.commit()

            if simulate_timeout:
                # Test-only path: a query guaranteed to outlast its timeout.
                timeout, query = '1ms', "SELECT pg_sleep(2)"
            else:
                # Presumably the configured value is in seconds and is
                # scaled to milliseconds here -- confirm against the config.
                timeout = 1000 * int(config.poimport.statement_timeout)
                query = sql
            cur.execute("SET statement_timeout to %s" % quote(timeout))
            cur.execute(query)
        except TimeoutError:
            # XXX JeroenVermeulen 2010-11-24 bug=680802: This should be
            # logged, so that we know whether it still happens.
            transaction.abort()
            return

        rows = cur.fetchall()

        assert TranslationConstants.MAX_PLURAL_FORMS == 6, (
            "Change this code to support %d plural forms"
            % TranslationConstants.MAX_PLURAL_FORMS)
        for row in rows:
            msgid, msgid_plural, context, _date_reviewed = row[:4]
            # The remaining columns are msgstr0 .. msgstr5; index them by
            # plural form number.
            msgstrs = dict(enumerate(row[4:]))

            key = (msgid, msgid_plural, context)
            if key not in self.current_messages:
                new_message = TranslationMessageData()
                new_message.context = context
                new_message.msgid_singular = msgid
                new_message.msgid_plural = msgid_plural
                self.current_messages[key] = new_message
            message = self.current_messages[key]

            for plural in xrange(TranslationConstants.MAX_PLURAL_FORMS):
                msgstr = msgstrs.get(plural)
                if msgstr is None:
                    continue
                # Only fill forms the stored message does not have yet, so
                # rows seen earlier in the ordering win.
                existing = message.translations
                if len(existing) <= plural or existing[plural] is None:
                    message.addTranslation(plural, msgstr)
 def test_resetAllTranslations(self):
     # Resetting a translated message leaves it without translations.
     message = TranslationMessageData()
     message.addTranslation(0, 'singular')
     message.resetAllTranslations()
     remaining = message.translations
     self.assertEqual(remaining, [])
Exemple #28
0
    def _fetchDBRows(self, simulate_timeout=False):
        """Populate `self.current_messages` from the database.

        Queries the POTMsgSets of this POFile's template together with the
        TranslationMessages for its language whose flag column (see
        `_getFlagName`) is true, and keeps one `TranslationMessageData` per
        (msgid, msgid_plural, context).  Rows are processed in query order;
        a row never overwrites a plural form that an earlier row with the
        same key already provided.

        The transaction is committed before querying.  On `TimeoutError`
        the transaction is aborted and nothing is stored.

        :param simulate_timeout: Used in tests; runs a sleeping query with a
            1ms statement timeout instead of the real one.
        """
        plural_forms = TranslationConstants.MAX_PLURAL_FORMS

        # Build the per-form JOIN clauses and SELECT columns together.
        msgstr_joins = []
        translations = []
        for form in xrange(plural_forms):
            msgstr_joins.append(
                "LEFT OUTER JOIN POTranslation AS pt%d "
                "ON pt%d.id = TranslationMessage.msgstr%d"
                % (form, form, form))
            translations.append(
                "pt%d.translation AS translation%d" % (form, form))

        substitutions = {
            'translation_columns': ', '.join(translations),
            'translation_joins': '\n'.join(msgstr_joins),
            'language': quote(self.pofile.language),
            'potemplate': quote(self.pofile.potemplate),
            'flag': self._getFlagName(),
        }

        sql = """
            SELECT
                POMsgId.msgid AS msgid,
                POMsgID_Plural.msgid AS msgid_plural,
                context,
                date_reviewed,
                %(translation_columns)s
            FROM POTMsgSet
            JOIN TranslationTemplateItem ON
                TranslationTemplateItem.potmsgset = POTMsgSet.id AND
                TranslationTemplateItem.potemplate = %(potemplate)s
            JOIN TranslationMessage ON
                POTMsgSet.id=TranslationMessage.potmsgset AND (
                    TranslationMessage.potemplate = %(potemplate)s OR
                    TranslationMessage.potemplate IS NULL) AND
                TranslationMessage.language = %(language)s
            %(translation_joins)s
            JOIN POMsgID ON
                POMsgID.id = POTMsgSet.msgid_singular
            LEFT OUTER JOIN POMsgID AS POMsgID_Plural ON
                POMsgID_Plural.id = POTMsgSet.msgid_plural
            WHERE
                %(flag)s IS TRUE
            ORDER BY
                TranslationTemplateItem.sequence,
                TranslationMessage.potemplate NULLS LAST
          """ % substitutions

        cur = cursor()
        try:
            # XXX JeroenVermeulen 2010-11-24 bug=680802: The timeout works
            # around bug 408718; the query is simpler now, so it may no
            # longer be needed.

            # Anything not committed by now would be lost when we hit
            # TimeoutError, so commit first.
            transaction.commit()

            if simulate_timeout:
                # Only used by tests.
                timeout, query = '1ms', "SELECT pg_sleep(2)"
            else:
                # NOTE(review): looks like a seconds-to-milliseconds
                # conversion of the configured timeout -- confirm.
                timeout = 1000 * int(config.poimport.statement_timeout)
                query = sql
            cur.execute("SET statement_timeout to %s" % quote(timeout))
            cur.execute(query)
        except TimeoutError:
            # XXX JeroenVermeulen 2010-11-24 bug=680802: Log this, to find
            # out whether it still happens.
            transaction.abort()
            return

        rows = cur.fetchall()

        assert TranslationConstants.MAX_PLURAL_FORMS == 6, (
            "Change this code to support %d plural forms" %
            TranslationConstants.MAX_PLURAL_FORMS)
        for row in rows:
            msgid, msgid_plural, context, _date_reviewed = row[:4]
            # Columns after the first four are msgstr0 .. msgstr5, keyed
            # here by plural form number.
            msgstrs = dict(enumerate(row[4:]))

            key = (msgid, msgid_plural, context)
            if key not in self.current_messages:
                fresh = TranslationMessageData()
                fresh.context = context
                fresh.msgid_singular = msgid
                fresh.msgid_plural = msgid_plural
                self.current_messages[key] = fresh
            message = self.current_messages[key]

            for plural in xrange(TranslationConstants.MAX_PLURAL_FORMS):
                msgstr = msgstrs.get(plural)
                if msgstr is None:
                    continue
                # Keep whatever an earlier row already supplied.
                known = message.translations
                if len(known) <= plural or known[plural] is None:
                    message.addTranslation(plural, msgstr)
Exemple #29
0
    def testIsIdenticalTranslation(self):
        """Test `is_identical_translation`."""
        left = TranslationMessageData()
        right = TranslationMessageData()
        left.msgid_singular = "foo"
        right.msgid_singular = "foo"

        # Same msgid, nothing else set.
        self.assertTrue(
            is_identical_translation(left, right),
            "Two blank translation messages do not evaluate as identical.")

        # The plural msgid takes part in the comparison.
        left.msgid_plural = "foos"
        self.assertFalse(
            is_identical_translation(left, right),
            "Message with fewer plural forms is accepted as identical.")
        right.msgid_plural = "splat"
        self.assertFalse(
            is_identical_translation(left, right),
            "Messages with different plurals accepted as identical.")
        right.msgid_plural = "foos"
        self.assertTrue(
            is_identical_translation(left, right),
            "Messages with identical plural forms not accepted as identical.")

        # One translated form against none, then against the same form.
        left._translations = ["le foo"]
        self.assertFalse(
            is_identical_translation(left, right),
            "Failed to distinguish translated message from untranslated one.")
        right._translations = ["le foo"]
        self.assertTrue(is_identical_translation(left, right),
                        "Identical translations not accepted as identical.")

        # A second translated form on one side only, then on both.
        left._translations = ["le foo", "les foos"]
        self.assertFalse(
            is_identical_translation(left, right),
            "Failed to distinguish message with missing plural translation.")
        right._translations = ["le foo", "les foos"]
        self.assertTrue(
            is_identical_translation(left, right),
            "Identical plural translations not accepted as equal.")

        # A third form on one side only; afterwards the other side gets it
        # too, plus a trailing None, and the two must still compare equal.
        left._translations = ["le foo", "les foos", "beaucoup des foos"]
        self.assertFalse(
            is_identical_translation(left, right),
            "Failed to distinguish message with extra plural translations.")
        right._translations = [
            "le foo", "les foos", "beaucoup des foos", None]
        self.assertTrue(
            is_identical_translation(left, right),
            "Identical multi-form messages not accepted as identical.")
    def parse(self, content):
        """Parse given content as a property file.

        Once the parse is done, self.messages has a list of the available
        `ITranslationMessageData`s.

        Each `key=value` line becomes one message whose msgid is the key and
        whose singular translation is the stripped value.  A comment is
        attached to a message as its source comment only if it ends on the
        same line as the message or on the line right before it.

        :param content: The raw content of the .properties file.
        :raises TranslationFormatInvalidInputError: If the content is not
            valid UTF-8, or a line cannot be unicode-unescaped.
        :raises TranslationFormatSyntaxError: If a key is rejected by
            `valid_property_msgid`.
        """

        # .properties files are supposed to be unicode-escaped, but we know
        # that there are some .xpi language packs that instead, use UTF-8.
        # That's against the specification, but Mozilla applications accept
        # it anyway, so we try to support it too.
        # To do this support, we read the text as being in UTF-8
        # because unicode-escaped looks like ASCII files.
        try:
            content = content.decode('utf-8')
        except UnicodeDecodeError:
            raise TranslationFormatInvalidInputError(
                'Content is not valid unicode-escaped text')

        line_num = 0
        is_multi_line_comment = False
        last_comment = None
        last_comment_line_num = 0
        ignore_comment = False
        is_message = False
        translation = u''
        for line in content.splitlines():
            # Now, to "normalize" all to the same encoding, we encode to
            # unicode-escape first, and then decode it to unicode
            # XXX: Danilo 2006-08-01: we _might_ get performance
            # improvements if we reimplement this to work directly,
            # though, it will be hard to beat C-based de/encoder.
            # This call unescapes everything so we don't need to care about
            # quotes escaping.
            try:
                string = line.encode('raw-unicode_escape')
                line = string.decode('unicode_escape')
            except UnicodeDecodeError as exception:
                raise TranslationFormatInvalidInputError(
                    filename=self.filename, line_number=line_num,
                    message=str(exception))

            line_num += 1
            if not is_multi_line_comment:
                # Remove any white space before the useful data, like
                # ' # foo'.
                line = line.lstrip()
                if len(line) == 0:
                    # It's an empty line. Reset any previous comment we have.
                    last_comment = None
                    last_comment_line_num = 0
                    ignore_comment = False
                elif line.startswith(u'#') or line.startswith(u'//'):
                    # It's a whole line comment.
                    ignore_comment = False
                    # NOTE(review): only one character is dropped here, so a
                    # '//' comment keeps its second slash in the stored
                    # text -- confirm whether that is intended.
                    line = line[1:].strip()
                    if last_comment:
                        last_comment += line
                    elif len(line) > 0:
                        last_comment = line

                    if last_comment and not last_comment.endswith('\n'):
                        # Comments must end always with a new line.
                        last_comment += '\n'

                    last_comment_line_num = line_num
                    continue

            # Unescaped URLs are a common mistake: the "//" starts an
            # end-of-line comment.  To work around that, treat "://" as
            # a special case.
            just_saw_colon = False

            while line:
                if is_multi_line_comment:
                    if line.startswith(u'*/'):
                        # The comment ended, we jump the closing tag and
                        # continue with the parsing.
                        line = line[2:]
                        is_multi_line_comment = False
                        last_comment_line_num = line_num
                        if ignore_comment:
                            last_comment = None
                            ignore_comment = False

                        # Comments must end always with a new line.  There
                        # is nothing to terminate when the comment was just
                        # discarded (licence block) or was empty with no
                        # earlier comment; appending to None would raise
                        # TypeError.
                        if last_comment is not None:
                            last_comment += '\n'
                    elif line.startswith(self.license_block_text):
                        # It's a comment with a licence notice, this
                        # comment can be ignored.
                        ignore_comment = True
                        # Jump the whole tag
                        line = line[len(self.license_block_text):]
                    else:
                        # Store the character.
                        if last_comment is None:
                            last_comment = line[0]
                        elif last_comment_line_num == line_num:
                            last_comment += line[0]
                        else:
                            last_comment = u'%s\n%s' % (last_comment, line[0])
                            last_comment_line_num = line_num
                        # Jump the processed char.
                        line = line[1:]
                    continue
                elif line.startswith(u'/*'):
                    # It's a multi line comment
                    is_multi_line_comment = True
                    ignore_comment = False
                    last_comment_line_num = line_num
                    # Jump the comment starting tag
                    line = line[2:]
                    continue
                elif line.startswith(u'//') and not just_saw_colon:
                    # End-of-line comment.
                    last_comment = '%s\n' % line[2:].strip()
                    last_comment_line_num = line_num
                    # On to next line.
                    break
                elif is_message:
                    # Store the char and continue.
                    head_char = line[0]
                    translation += head_char
                    line = line[1:]
                    just_saw_colon = (head_char == ':')
                    continue
                elif u'=' in line:
                    # Looks like a message string.
                    (key, value) = line.split('=', 1)
                    # Remove leading and trailing white spaces.
                    key = key.strip()

                    if valid_property_msgid(key):
                        is_message = True
                        # Jump the msgid, control chars and leading white
                        # space.
                        line = value.lstrip()
                        continue
                    else:
                        raise TranslationFormatSyntaxError(
                            line_number=line_num,
                            message=u"invalid msgid: '%s'" % key)
                else:
                    # Got a line that is not a valid message nor a valid
                    # comment. Ignore it because main en-US.xpi catalog from
                    # Firefox has such line/error. We follow the 'be strict
                    # with what you export, be permissive with what you
                    # import' policy.
                    break
            if is_message:
                # We just parsed a message, so we need to add it to the list
                # of messages.
                if ignore_comment or last_comment_line_num < line_num - 1:
                    # We must ignore the comment or either the comment is not
                    # the last thing before this message or is not in the same
                    # line as this message.
                    last_comment = None
                    ignore_comment = False

                message = TranslationMessageData()
                message.msgid_singular = key
                message.context = self.chrome_path
                message.file_references_list = [
                    "%s:%d(%s)" % (self.filename, line_num, key)]
                value = translation.strip()
                message.addTranslation(
                    TranslationConstants.SINGULAR_FORM, value)
                message.singular_text = value
                message.source_comment = last_comment
                self.messages.append(message)

                # Reset status vars.
                last_comment = None
                last_comment_line_num = 0
                is_message = False
                translation = u''
Exemple #31
0
class POParser(object):
    """Parser class for Gettext files."""
    def __init__(self, plural_formula=None):
        """Set up a parser that has not parsed anything yet.

        :param plural_formula: Stored as the expected plural formula; it is
            handed to `plural_form_mapper` when a header is parsed.
        """
        # Nothing has been parsed so far.
        self._lineno = 0
        self._translation_file = None

        self._expected_plural_formula = plural_formula
        # As long as no header has been seen in the PO file, plural forms
        # map onto themselves (i.e. there is no mapping).
        self._plural_form_mapping = make_plurals_identity_map()

        # True while the parser is reading the continuation of a string
        # that follows an escaped newline.
        self._escaped_line_break = False

    def _emitSyntaxWarning(self, message):
        """Attach a syntax warning for the current line to the parsed file.

        The warning object is always created, but it is only recorded when
        a translation file is available to hold it.
        """
        syntax_warning = POSyntaxWarning(message, line_number=self._lineno)
        if not self._translation_file:
            return
        recorded = self._translation_file.syntax_warnings
        recorded.append(unicode(syntax_warning))

    def _decode(self):
        # is there anything to convert?
        if not self._pending_chars:
            return

        # if the PO header hasn't been parsed, then we don't know the
        # encoding yet
        if self._translation_file.header is None:
            return

        charset = self._translation_file.header.charset
        decode = codecs.getdecoder(charset)
        # decode as many characters as we can:
        try:
            newchars, length = decode(self._pending_chars, 'strict')
        except UnicodeDecodeError as exc:
            # XXX: James Henstridge 2006-03-16:
            # If the number of unconvertable chars is longer than a
            # multibyte sequence to be, the UnicodeDecodeError indicates
            # a real error, rather than a partial read.
            # I don't know what the longest multibyte sequence in the
            # encodings we need to support, but it shouldn't be more
            # than 10 bytes ...
            if len(self._pending_chars) - exc.start > 10:
                raise TranslationFormatInvalidInputError(
                    line_number=self._lineno,
                    message="Could not decode input from %s" % charset)
            newchars, length = decode(self._pending_chars[:exc.start],
                                      'strict')
        self._pending_unichars += newchars
        self._pending_chars = self._pending_chars[length:]

    def _getHeaderLine(self):
        if self._translation_file.header is not None:
            # We know what charset the data is in, as we've already
            # parsed the header.  However, we're going to handle this
            # more efficiently, so we don't want to use _getHeaderLine
            # except for parsing the header.
            raise AssertionError('using _getHeaderLine after header is parsed')

        # We don't know what charset the data is in, so we parse it one line
        # at a time until we have the header, and then we'll know how to
        # treat the rest of the data.
        parts = re.split(r'\n|\r\n|\r', self._pending_chars, 1)
        if len(parts) == 1:
            # only one line
            return None
        line, self._pending_chars = parts
        return line.strip()

    def parse(self, content_text):
        """Parse string as a PO file.

        The header is located by parsing raw lines one by one, each decoded
        with the charset `parse_charset` finds in `content_text`.  After
        that, the rest of the buffered text is parsed in a single pass and
        the last pending message is stored.

        :param content_text: The raw PO file content.
        :return: The `TranslationFileData` filled in while parsing.
        :raises TranslationFormatSyntaxError: If the file contains no
            messages, has no header, or its last message is truncated.
        """
        # Reset the parser: state for the file as a whole...
        self._translation_file = TranslationFileData()
        self._messageids = set()
        self._pending_chars = content_text
        self._pending_unichars = u''
        self._parsed_content = u''
        self._lineno = 0
        # ...and state for the message being read.
        self._message = TranslationMessageData()
        self._message_lineno = self._lineno
        self._section = None
        self._plural_case = None

        # Find out which charset the content uses before anything else.
        charset = parse_charset(content_text)

        # Parse the header, inefficiently: one raw line at a time.  It
        # ought to be short, so this isn't disastrous.
        raw_line = self._getHeaderLine()
        while raw_line is not None:
            self._parseLine(raw_line.decode(charset))
            have_header = self._translation_file.header is not None
            if have_header or self._message.msgid_singular:
                # The header is known, or this is a message with a
                # non-empty msgid and therefore not a header.
                break
            raw_line = self._getHeaderLine()

        if raw_line is None:
            lacks_header = self._translation_file.header is None
            if lacks_header and not self._message.msgid_singular:
                # The file holds no actual messages.
                self._dumpCurrentSection()

                # There may be a header, though.
                if not self._message.translations:
                    raise TranslationFormatSyntaxError(
                        message="File contains no messages.")
                singular = self._message.translations[
                    TranslationConstants.SINGULAR_FORM]
                self._parseHeader(singular, self._message.comment)

            # Nothing remains to be parsed.
            return self._translation_file

        # Handle whatever is left in one go.
        for remaining_line in re.split(
                r'\n|\r\n|\r', self._pending_unichars):
            self._parseLine(remaining_line)

        if self._translation_file.header is None:
            raise TranslationFormatSyntaxError(
                message='No header found in this pofile')

        # The last message still has to be dumped, if there is one.
        if self._message is None or self._section is None:
            # No pending message, or one without content (possibly just a
            # comment): ignore it.
            return self._translation_file
        if self._section != 'msgstr':
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Got a truncated message!')
        self._dumpCurrentSection()
        self._storeCurrentMessage()
        return self._translation_file

    def _storeCurrentMessage(self):
        if self._message is not None:
            msgkey = self._message.msgid_singular
            if self._message.context is not None:
                msgkey = '%s\2%s' % (self._message.context, msgkey)
            if msgkey in self._messageids:
                # We use '%r' instead of '%d' because there are situations
                # when it returns an "<unprintable instance object>". You can
                # see more details on bug #2896
                raise TranslationFormatInvalidInputError(
                    message='PO file: duplicate msgid ending on line %r' %
                    (self._message_lineno))

            number_plural_forms = (
                self._translation_file.header.number_plural_forms)
            if (self._message.msgid_plural
                    and len(self._message.translations) < number_plural_forms):
                # Has plural forms but the number of translations is lower.
                # Fill the others with an empty string.
                for index in range(len(self._message.translations),
                                   number_plural_forms):
                    self._message.addTranslation(index, u'')

            self._translation_file.messages.append(self._message)
            self._messageids.add(msgkey)
            self._message = None

    def _parseHeader(self, header_text, header_comment):
        """Build the file's header and switch the parser over to its settings.

        Stores the new `POHeader` (and its syntax warnings) on the
        translation file, marks it fuzzy if the current message carries the
        'fuzzy' flag, sets up the plural form mapping and decodes the input
        that is still pending.

        :param header_text: Passed to `POHeader` as the header content.
        :param header_comment: Passed to `POHeader` as the header comment.
        :raises TranslationFormatInvalidInputError: Re-raised from header
            construction, with the current message's line number filled in
            if the error did not carry one.
        """
        translation_file = self._translation_file
        try:
            parsed_header = POHeader(header_text, header_comment)
            translation_file.header = parsed_header
            translation_file.syntax_warnings += parsed_header.syntax_warnings
        except TranslationFormatInvalidInputError as error:
            if error.line_number is None:
                error.line_number = self._message_lineno
            raise

        translation_file.header.is_fuzzy = 'fuzzy' in self._message.flags

        if translation_file.messages:
            self._emitSyntaxWarning("Header entry is not first entry.")

        formula = translation_file.header.plural_form_expression
        # Without a formula in the header, fall back to a simple one that
        # uses a single form for translations.
        formula = '0' if formula is None else formula
        self._plural_form_mapping = plural_form_mapper(
            formula, self._expected_plural_formula)

        # Convert the buffered input, now that the header tells us the
        # encoding.
        self._decode()

    def _unescapeNumericCharSequence(self, string):
        """Unescape leading sequence of escaped numeric character codes.

        This is for characters given in hexadecimal or octal escape notation.

        :return: a tuple: first, any leading part of `string` as an unescaped
            string (empty if `string` did not start with a numeric escape
            sequence), and second, the remainder of `string` after the leading
            numeric escape sequences have been parsed.
        """
        escaped_string = ''
        position = 0
        length = len(string)
        while position + 1 < length and string[position] == '\\':
            # Handle escaped characters given as numeric character codes.
            # These will still be in the original encoding.  We extract the
            # whole sequence of escaped chars to recode them later into
            # Unicode in a single call.
            lead_char = string[position + 1]
            if lead_char == 'x':
                # Hexadecimal escape.
                position += 4
            elif lead_char.isdigit():
                # Octal escape.
                position += 2
                # Up to two more octal digits.
                for i in range(2):
                    if string[position].isdigit():
                        position += 1
                    else:
                        break
            elif lead_char in ESCAPE_MAP:
                # It's part of our mapping table, we ignore it here.
                break
            else:
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message=("Unknown escape sequence %s" %
                             string[position:position + 2]))

        if position == 0:
            # No escaping to be done.
            return '', string

        # We found some text escaped that should be recoded to Unicode.
        # First, we unescape it.
        escaped_string, string = string[:position], string[position:]
        unescaped_string = escaped_string.decode('string-escape')

        if (self._translation_file is not None
                and self._translation_file.header is not None):
            # There is a header, so we know the original encoding for
            # the given string.
            charset = self._translation_file.header.charset
            know_charset = True
        else:
            # We don't know the original encoding of the imported file so we
            # cannot get the right values.  We try ASCII.
            # XXX JeroenVermeulen 2008-02-08: might as well try UTF-8 here.
            # It's a superset, and anything that's not UTF-8 is very unlikely
            # to validate as UTF-8.
            charset = 'ascii'
            know_charset = False

        try:
            decoded_text = unescaped_string.decode(charset)
        except UnicodeDecodeError:
            if know_charset:
                message = ("Could not decode escaped string as %s: (%s)" %
                           (charset, escaped_string))
            else:
                message = ("Could not decode escaped string: (%s)" %
                           escaped_string)
            raise TranslationFormatInvalidInputError(line_number=self._lineno,
                                                     message=message)

        return decoded_text, string

    def _parseQuotedString(self, string):
        r"""Parse a quoted string, interpreting escape sequences.

        :param string: One line of quoted text.  Unless the previous line
            ended in an escaped line break, it must begin with a double
            quote.
        :return: The text inside the quotes, with escape sequences replaced.
        :raises TranslationFormatSyntaxError: if the string is not quoted,
            is not terminated, or is followed by extra content.

          >>> parser = POParser()
          >>> parser._parseQuotedString(u'\"abc\"')
          u'abc'
          >>> parser._parseQuotedString(u'\"abc\\ndef\"')
          u'abc\ndef'
          >>> parser._parseQuotedString(u'\"ab\x63\"')
          u'abc'
          >>> parser._parseQuotedString(u'\"ab\143\"')
          u'abc'

          After the string has been converted to unicode, the backslash
          escaped sequences are still in the encoding that the charset header
          specifies. Such quoted sequences will be converted to unicode by
          this method.

          We don't know the encoding of the escaped characters and cannot be
          just recoded as Unicode so it's a TranslationFormatInvalidInputError
          >>> utf8_string = u'"view \\302\\253${version_title}\\302\\273"'
          >>> parser._parseQuotedString(utf8_string)
          Traceback (most recent call last):
          ...
          TranslationFormatInvalidInputError: Could not decode escaped string: (\302\253)

          Now, we note the original encoding so we get the right Unicode
          string.

          >>> class FakeHeader:
          ...     charset = 'UTF-8'
          >>> parser._translation_file = TranslationFileData()
          >>> parser._translation_file.header = FakeHeader()
          >>> parser._parseQuotedString(utf8_string)
          u'view \xab${version_title}\xbb'

          Let's see that we raise a TranslationFormatInvalidInputError
          exception when we have an escaped char that is not valid in the
          declared encoding of the original string:

          >>> iso8859_1_string = u'"foo \\xf9"'
          >>> parser._parseQuotedString(iso8859_1_string)
          Traceback (most recent call last):
          ...
          TranslationFormatInvalidInputError: Could not decode escaped string as UTF-8: (\xf9)

          An error will be raised if the entire string isn't contained in
          quotes properly:

          >>> parser._parseQuotedString(u'abc')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: String is not quoted
          >>> parser._parseQuotedString(u'\"ab')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: String not terminated
          >>> parser._parseQuotedString(u'\"ab\"x')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: Extra content found after string: (x)
        """
        if self._escaped_line_break:
            # Continuing a line after an escaped newline.  Strip indentation.
            string = string.lstrip()
            self._escaped_line_break = False
        else:
            # Regular string.  Must start with opening quote, which we strip.
            if string[0] != '"':
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno, message="String is not quoted")
            string = string[1:]

        # Consume `string` from the left, appending the unescaped text to
        # `output`, until the closing quote or an escaped line break.
        output = ''
        while len(string) > 0:
            if string[0] == '"':
                # Reached the end of the quoted string.  It's rare, but there
                # may be another quoted string on the same line.  It should be
                # suffixed to what we already have, with any whitespace
                # between the strings removed.
                string = string[1:].lstrip()
                if len(string) == 0:
                    # End of line, end of string: the normal case
                    break
                if string[0] == '"':
                    # Start of a new string.  We've already swallowed the
                    # closing quote and any intervening whitespace; now
                    # swallow the re-opening quote and go on as if the string
                    # just went on normally
                    string = string[1:]
                    continue

                # if there is any non-string data afterwards, raise an
                # exception
                # (`string` was left-stripped and is not empty at this point,
                # so it cannot consist of whitespace only.)
                if len(string) > 0 and not string.isspace():
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message=("Extra content found after string: (%s)" %
                                 string))
                break
            elif string[0] == '\\':
                if len(string) == 1:
                    # A lone backslash at the end of the line: the string
                    # continues on the next line, which will then be parsed
                    # without requiring an opening quote.
                    self._escaped_line_break = True
                    string = ''
                    break
                elif string[1] in ESCAPE_MAP:
                    # We got one of the special escaped chars we know about.
                    # Unescape it using the mapping table.
                    output += ESCAPE_MAP[string[1]]
                    string = string[2:]
                else:
                    # Anything else is handed over as a numeric (hexadecimal
                    # or octal) escape sequence.
                    unescaped, string = (
                        self._unescapeNumericCharSequence(string))
                    output += unescaped
            else:
                # Normal text.  Eat up as much as we can in one go.
                text = re.match(STRAIGHT_TEXT_RUN, string)
                output += text.group()
                zero, runlength = text.span()
                string = string[runlength:]
        else:
            # This `else` belongs to the `while` loop: it only runs when the
            # input was exhausted without leaving the loop through `break`.
            # We finished parsing the string without finding the ending quote
            # char.
            raise TranslationFormatSyntaxError(line_number=self._lineno,
                                               message="String not terminated")

        return output

    def _dumpCurrentSection(self):
        """Dump current parsed content inside the translation message."""
        if self._section is None:
            # There is nothing to dump.
            return
        elif self._section == 'msgctxt':
            self._message.context = self._parsed_content
        elif self._section == 'msgid':
            self._message.msgid_singular = self._parsed_content
        elif self._section == 'msgid_plural':
            self._message.msgid_plural = self._parsed_content
            # Note in the header that there are plural forms.
            self._translation_file.header.has_plural_forms = True
        elif self._section == 'msgstr':
            if self._message.msgid_plural is not None:
                self._message.addTranslation(
                    self._plural_form_mapping[self._plural_case],
                    self._parsed_content)
            else:
                self._message.addTranslation(self._plural_case,
                                             self._parsed_content)
        else:
            raise AssertionError('Unknown section %s' % self._section)

        self._parsed_content = u''

    def _parseFreshLine(self, line, original_line):
        """Parse a new line (not a continuation after escaped newline).

        :param line: Remaining part of input line.
        :param original_line: Line as it originally was on input.
        :return: If there is one, the first line of a quoted string belonging
            to the line's section.  Otherwise, None.
        """
        is_obsolete = False
        if line.startswith('#~'):
            if line.startswith('#~|'):
                # This is an old msgid for an obsolete message.
                return None
            else:
                is_obsolete = True
                line = line[2:].lstrip()
                if len(line) == 0:
                    return None

        # If we get a comment line after a msgstr or a line starting with
        # msgid or msgctxt, this is a new entry.
        if ((line.startswith('#') or line.startswith('msgid')
             or line.startswith('msgctxt')) and self._section == 'msgstr'):
            if self._message is None:
                # first entry - do nothing.
                pass
            elif self._message.msgid_singular:
                self._dumpCurrentSection()
                self._storeCurrentMessage()
            elif self._translation_file.header is None:
                # When there is no msgid in the parsed message, it's the
                # header for this file.
                self._dumpCurrentSection()
                self._parseHeader(
                    self._message.translations[
                        TranslationConstants.SINGULAR_FORM],
                    self._message.comment)
            else:
                self._emitSyntaxWarning("We got a second header.")

            # Start a new message.
            self._message = TranslationMessageData()
            self._message_lineno = self._lineno
            self._section = None
            self._plural_case = None
            self._parsed_content = u''

        if self._message is not None:
            # Record whether the message is obsolete.
            self._message.is_obsolete = is_obsolete

        if line[0] == '#':
            # Record flags
            if line[:2] == '#,':
                new_flags = [flag.strip() for flag in line[2:].split(',')]
                self._message.flags.update(new_flags)
                return None
            # Record file references
            if line[:2] == '#:':
                if self._message.file_references:
                    # There is already a file reference, let's split it from
                    # the new one with a new line char.
                    self._message.file_references += '\n'
                self._message.file_references += line[2:].strip()
                return None
            # Record source comments
            if line[:2] == '#.':
                self._message.source_comment += line[2:].strip() + '\n'
                return None
            # Record comments
            self._message.comment += line[1:] + '\n'
            return None

        # Now we are in a msgctxt or msgid section, output previous section
        if line.startswith('msgid_plural'):
            if self._section != 'msgid':
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid_plural")
            self._dumpCurrentSection()
            self._section = 'msgid_plural'
            line = line[len('msgid_plural'):]
        elif line.startswith('msgctxt'):
            if (self._section is not None
                    and (self._section == 'msgctxt'
                         or self._section.startswith('msgid'))):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgctxt")
            self._section = 'msgctxt'
            line = line[len('msgctxt'):]
        elif line.startswith('msgid'):
            if (self._section is not None
                    and self._section.startswith('msgid')):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid")
            if self._section is not None:
                self._dumpCurrentSection()
            self._section = 'msgid'
            line = line[len('msgid'):]
            self._plural_case = None
        # Now we are in a msgstr section
        elif line.startswith('msgstr'):
            self._dumpCurrentSection()
            self._section = 'msgstr'
            line = line[len('msgstr'):]
            # XXX kiko 2005-08-19: if line is empty, it means we got an msgstr
            # followed by a newline; that may be critical, but who knows?
            if line.startswith('['):
                # Plural case
                new_plural_case, line = line[1:].split(']', 1)

                try:
                    new_plural_case = int(new_plural_case)
                except ValueError:
                    # Trigger "invalid plural case number" error.
                    new_plural_case = -1

                if new_plural_case < 0:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Invalid plural case number.")
                elif new_plural_case >= TranslationConstants.MAX_PLURAL_FORMS:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Unsupported plural case number.")

                if (self._plural_case is not None) and (new_plural_case !=
                                                        self._plural_case + 1):
                    self._emitSyntaxWarning("Bad plural case number.")
                if new_plural_case != self._plural_case:
                    self._plural_case = new_plural_case
                else:
                    self._emitSyntaxWarning(
                        "msgstr[] repeats same plural case number.")
            else:
                self._plural_case = TranslationConstants.SINGULAR_FORM
        elif self._section is None:
            raise TranslationFormatSyntaxError(line_number=self._lineno,
                                               message='Invalid content: %r' %
                                               original_line)
        else:
            # This line could be the continuation of a previous section.
            pass

        line = line.strip()
        if len(line) == 0:
            self._emitSyntaxWarning(
                "Line has no content; this is not supported by some "
                "implementations of msgfmt.")
        return line

    def _parseLine(self, original_line):
        self._lineno += 1
        # Skip empty lines
        line = original_line.strip()
        if len(line) == 0:
            return

        if not self._escaped_line_break:
            line = self._parseFreshLine(line, original_line)
            if line is None or len(line) == 0:
                return

        line = self._parseQuotedString(line)

        text_section_types = ('msgctxt', 'msgid', 'msgid_plural', 'msgstr')
        if self._section not in text_section_types:
            raise TranslationFormatSyntaxError(line_number=self._lineno,
                                               message='Invalid content: %r' %
                                               original_line)

        self._parsed_content += line
Exemple #32
0
    def parse(self, content_text):
        """Parse string as a PO file.

        The input is read line by line, decoded with the charset found by
        `parse_charset`, until the header has been parsed or a message with
        a non-empty msgid shows up.  The remaining, already decoded input is
        then parsed in one go.

        :param content_text: Encoded content of the PO file.
        :return: The `TranslationFileData` filled in by the parser.
        :raises TranslationFormatSyntaxError: if the file contains no
            messages, has no header, or ends in a truncated message.
        """
        # Reset the file-level state of the parser.
        self._translation_file = TranslationFileData()
        self._messageids = set()
        self._pending_chars = content_text
        self._pending_unichars = u''
        self._lineno = 0
        # Reset the state of the message being assembled.
        self._message = TranslationMessageData()
        self._message_lineno = self._lineno
        self._section = None
        self._plural_case = None
        self._parsed_content = u''

        # Find out which charset content_text uses.
        charset = parse_charset(content_text)

        # Read the header one line at a time.  That is inefficient, but the
        # header ought to be short.
        while True:
            header_line = self._getHeaderLine()
            if header_line is None:
                break
            self._parseLine(header_line.decode(charset))
            if (self._translation_file.header is not None
                    or self._message.msgid_singular):
                # Either the header is known now, or this is a message with
                # a non-empty msgid and therefore not a header.
                break

        if header_line is None:
            # Ran out of input while still looking for the header.
            headerless = self._translation_file.header is None
            if headerless and not self._message.msgid_singular:
                # No actual messages in this file...
                self._dumpCurrentSection()

                # ...but there may still be a header.
                translations = self._message.translations
                if not translations:
                    raise TranslationFormatSyntaxError(
                        message="File contains no messages.")
                self._parseHeader(
                    translations[TranslationConstants.SINGULAR_FORM],
                    self._message.comment)

            # Nothing else to parse.
            return self._translation_file

        # Parse whatever is left in a single pass.
        for text_line in re.split(r'\n|\r\n|\r', self._pending_unichars):
            self._parseLine(text_line)

        if self._translation_file.header is None:
            raise TranslationFormatSyntaxError(
                message='No header found in this pofile')

        if self._message is None or self._section is None:
            # Either there is no pending message, or it has no content or
            # is just a comment; there is nothing to store.
            return self._translation_file

        if self._section != 'msgstr':
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Got a truncated message!')

        # Store the last message of the file.
        self._dumpCurrentSection()
        self._storeCurrentMessage()
        return self._translation_file
class POParser(object):
    """Parser class for Gettext files."""

    def __init__(self, plural_formula=None):
        """Set up a parser that has not parsed any file yet.

        :param plural_formula: Stored as the expected plural formula of the
            parser; defaults to None.
        """
        # No string continuation after an escaped newline is pending.
        self._escaped_line_break = False

        self._expected_plural_formula = plural_formula
        # Default plural form mapping (i.e. no mapping), used when the PO
        # file has no header.
        self._plural_form_mapping = make_plurals_identity_map()

        self._lineno = 0
        self._translation_file = None

    def _emitSyntaxWarning(self, message):
        """Record a syntax warning for the current line.

        The warning is always created; it is only added to the translation
        file's syntax warnings when there is a translation file.

        :param message: Text of the warning.
        """
        warning = POSyntaxWarning(message, line_number=self._lineno)
        translation_file = self._translation_file
        if not translation_file:
            return
        translation_file.syntax_warnings.append(unicode(warning))

    def _decode(self):
        # is there anything to convert?
        if not self._pending_chars:
            return

        # if the PO header hasn't been parsed, then we don't know the
        # encoding yet
        if self._translation_file.header is None:
            return

        charset = self._translation_file.header.charset
        decode = codecs.getdecoder(charset)
        # decode as many characters as we can:
        try:
            newchars, length = decode(self._pending_chars, 'strict')
        except UnicodeDecodeError as exc:
            # XXX: James Henstridge 2006-03-16:
            # If the number of unconvertable chars is longer than a
            # multibyte sequence to be, the UnicodeDecodeError indicates
            # a real error, rather than a partial read.
            # I don't know what the longest multibyte sequence in the
            # encodings we need to support, but it shouldn't be more
            # than 10 bytes ...
            if len(self._pending_chars) - exc.start > 10:
                raise TranslationFormatInvalidInputError(
                    line_number=self._lineno,
                    message="Could not decode input from %s" % charset)
            newchars, length = decode(self._pending_chars[:exc.start],
                                      'strict')
        self._pending_unichars += newchars
        self._pending_chars = self._pending_chars[length:]

    def _getHeaderLine(self):
        if self._translation_file.header is not None:
            # We know what charset the data is in, as we've already
            # parsed the header.  However, we're going to handle this
            # more efficiently, so we don't want to use _getHeaderLine
            # except for parsing the header.
            raise AssertionError(
                'using _getHeaderLine after header is parsed')

        # We don't know what charset the data is in, so we parse it one line
        # at a time until we have the header, and then we'll know how to
        # treat the rest of the data.
        parts = re.split(r'\n|\r\n|\r', self._pending_chars, 1)
        if len(parts) == 1:
            # only one line
            return None
        line, self._pending_chars = parts
        return line.strip()

    def parse(self, content_text):
        """Parse string as a PO file.

        The input is read line by line, decoded with the charset found by
        `parse_charset`, until the header has been parsed or a message with
        a non-empty msgid shows up.  The remaining, already decoded input is
        then parsed in one go.

        :param content_text: Encoded content of the PO file.
        :return: The `TranslationFileData` filled in by the parser.
        :raises TranslationFormatSyntaxError: if the file contains no
            messages, has no header, or ends in a truncated message.
        """
        # Reset the file-level state of the parser.
        self._translation_file = TranslationFileData()
        self._messageids = set()
        self._pending_chars = content_text
        self._pending_unichars = u''
        self._lineno = 0
        # Reset the state of the message being assembled.
        self._message = TranslationMessageData()
        self._message_lineno = self._lineno
        self._section = None
        self._plural_case = None
        self._parsed_content = u''

        # Find out which charset content_text uses.
        charset = parse_charset(content_text)

        # Read the header one line at a time.  That is inefficient, but the
        # header ought to be short.
        while True:
            header_line = self._getHeaderLine()
            if header_line is None:
                break
            self._parseLine(header_line.decode(charset))
            if (self._translation_file.header is not None or
                    self._message.msgid_singular):
                # Either the header is known now, or this is a message with
                # a non-empty msgid and therefore not a header.
                break

        if header_line is None:
            # Ran out of input while still looking for the header.
            headerless = self._translation_file.header is None
            if headerless and not self._message.msgid_singular:
                # No actual messages in this file...
                self._dumpCurrentSection()

                # ...but there may still be a header.
                translations = self._message.translations
                if not translations:
                    raise TranslationFormatSyntaxError(
                        message="File contains no messages.")
                self._parseHeader(
                    translations[TranslationConstants.SINGULAR_FORM],
                    self._message.comment)

            # Nothing else to parse.
            return self._translation_file

        # Parse whatever is left in a single pass.
        for text_line in re.split(r'\n|\r\n|\r', self._pending_unichars):
            self._parseLine(text_line)

        if self._translation_file.header is None:
            raise TranslationFormatSyntaxError(
                message='No header found in this pofile')

        if self._message is None or self._section is None:
            # Either there is no pending message, or it has no content or
            # is just a comment; there is nothing to store.
            return self._translation_file

        if self._section != 'msgstr':
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Got a truncated message!')

        # Store the last message of the file.
        self._dumpCurrentSection()
        self._storeCurrentMessage()
        return self._translation_file

    def _storeCurrentMessage(self):
        if self._message is not None:
            msgkey = self._message.msgid_singular
            if self._message.context is not None:
                msgkey = '%s\2%s' % (self._message.context, msgkey)
            if msgkey in self._messageids:
                # We use '%r' instead of '%d' because there are situations
                # when it returns an "<unprintable instance object>". You can
                # see more details on bug #2896
                raise TranslationFormatInvalidInputError(
                    message='PO file: duplicate msgid ending on line %r' % (
                        self._message_lineno))

            number_plural_forms = (
                self._translation_file.header.number_plural_forms)
            if (self._message.msgid_plural and
                len(self._message.translations) < number_plural_forms):
                # Has plural forms but the number of translations is lower.
                # Fill the others with an empty string.
                for index in range(
                    len(self._message.translations), number_plural_forms):
                    self._message.addTranslation(index, u'')

            self._translation_file.messages.append(self._message)
            self._messageids.add(msgkey)
            self._message = None

    def _parseHeader(self, header_text, header_comment):
        """Build the PO header and switch the parser over to its settings.

        The parsed header is stored on the translation file together with
        its syntax warnings and the fuzzy state of the header entry.  The
        plural form mapping is then set up from the header's plural formula
        and any buffered input is decoded.

        :param header_text: Content of the header entry.
        :param header_comment: Comment attached to the header entry.
        :raises TranslationFormatInvalidInputError: re-raised from
            `POHeader`; if the error carries no line number, the line number
            of the current message is filled in first.
        """
        translation_file = self._translation_file
        try:
            parsed_header = POHeader(header_text, header_comment)
            translation_file.header = parsed_header
            translation_file.syntax_warnings += parsed_header.syntax_warnings
        except TranslationFormatInvalidInputError as error:
            if error.line_number is None:
                error.line_number = self._message_lineno
            raise

        header_is_fuzzy = 'fuzzy' in self._message.flags
        translation_file.header.is_fuzzy = header_is_fuzzy

        if translation_file.messages:
            self._emitSyntaxWarning("Header entry is not first entry.")

        formula = translation_file.header.plural_form_expression
        if formula is None:
            # Fall back to the simplest plural formula: a single form for
            # every translation.
            formula = '0'
        self._plural_form_mapping = plural_form_mapper(
            formula, self._expected_plural_formula)

        # Now that the header is known, convert the buffered input to the
        # encoding it specifies.
        self._decode()

    def _unescapeNumericCharSequence(self, string):
        """Unescape leading sequence of escaped numeric character codes.

        This is for characters given in hexadecimal or octal escape notation.

        :return: a tuple: first, any leading part of `string` as an unescaped
            string (empty if `string` did not start with a numeric escape
            sequence), and second, the remainder of `string` after the leading
            numeric escape sequences have been parsed.
        """
        escaped_string = ''
        position = 0
        length = len(string)
        while position + 1 < length and string[position] == '\\':
            # Handle escaped characters given as numeric character codes.
            # These will still be in the original encoding.  We extract the
            # whole sequence of escaped chars to recode them later into
            # Unicode in a single call.
            lead_char = string[position + 1]
            if lead_char == 'x':
                # Hexadecimal escape.
                position += 4
            elif lead_char.isdigit():
                # Octal escape.
                position += 2
                # Up to two more octal digits.
                for i in xrange(2):
                    if string[position].isdigit():
                        position += 1
                    else:
                        break
            elif lead_char in ESCAPE_MAP:
                # It's part of our mapping table, we ignore it here.
                break
            else:
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message=("Unknown escape sequence %s" %
                             string[position:position + 2]))

        if position == 0:
            # No escaping to be done.
            return '', string

        # We found some text escaped that should be recoded to Unicode.
        # First, we unescape it.
        escaped_string, string = string[:position], string[position:]
        unescaped_string = escaped_string.decode('string-escape')

        if (self._translation_file is not None and
            self._translation_file.header is not None):
            # There is a header, so we know the original encoding for
            # the given string.
            charset = self._translation_file.header.charset
            know_charset = True
        else:
            # We don't know the original encoding of the imported file so we
            # cannot get the right values.  We try ASCII.
            # XXX JeroenVermeulen 2008-02-08: might as well try UTF-8 here.
            # It's a superset, and anything that's not UTF-8 is very unlikely
            # to validate as UTF-8.
            charset = 'ascii'
            know_charset = False

        try:
            decoded_text = unescaped_string.decode(charset)
        except UnicodeDecodeError:
            if know_charset:
                message = ("Could not decode escaped string as %s: (%s)"
                           % (charset, escaped_string))
            else:
                message = ("Could not decode escaped string: (%s)"
                           % escaped_string)
            raise TranslationFormatInvalidInputError(
                line_number=self._lineno, message=message)

        return decoded_text, string

    def _parseQuotedString(self, string):
        r"""Parse a quoted string, interpreting escape sequences.

        If the previous line ended in an escaped line break
        (`self._escaped_line_break` is set), `string` is treated as the
        continuation of that quoted string: its leading whitespace is
        stripped and no opening quote is expected.  Otherwise `string` must
        start with a double quote.

        :param string: The text to parse, already stripped of surrounding
            whitespace by the caller.
        :return: The text between the quotes, with escape sequences
            replaced by the characters they stand for.  Several quoted
            strings on one line are concatenated.
        :raise TranslationFormatSyntaxError: if the string is not quoted,
            is not terminated, or is followed by extra content.
        :raise TranslationFormatInvalidInputError: if a numeric escape
            sequence cannot be decoded.

          >>> parser = POParser()
          >>> parser._parseQuotedString(u'\"abc\"')
          u'abc'
          >>> parser._parseQuotedString(u'\"abc\\ndef\"')
          u'abc\ndef'
          >>> parser._parseQuotedString(u'\"ab\x63\"')
          u'abc'
          >>> parser._parseQuotedString(u'\"ab\143\"')
          u'abc'

          After the string has been converted to unicode, the backslash
          escaped sequences are still in the encoding that the charset header
          specifies. Such quoted sequences will be converted to unicode by
          this method.

          When we don't know the encoding of the escaped characters, they
          cannot simply be recoded as Unicode, so we raise a
          TranslationFormatInvalidInputError:

          >>> utf8_string = u'"view \\302\\253${version_title}\\302\\273"'
          >>> parser._parseQuotedString(utf8_string)
          Traceback (most recent call last):
          ...
          TranslationFormatInvalidInputError: Could not decode escaped string: (\302\253)

          Now, we note the original encoding so we get the right Unicode
          string.

          >>> class FakeHeader:
          ...     charset = 'UTF-8'
          >>> parser._translation_file = TranslationFileData()
          >>> parser._translation_file.header = FakeHeader()
          >>> parser._parseQuotedString(utf8_string)
          u'view \xab${version_title}\xbb'

          Let's see that we raise a TranslationFormatInvalidInputError
          exception when we have an escaped char that is not valid in the
          declared encoding of the original string:

          >>> iso8859_1_string = u'"foo \\xf9"'
          >>> parser._parseQuotedString(iso8859_1_string)
          Traceback (most recent call last):
          ...
          TranslationFormatInvalidInputError: Could not decode escaped string as UTF-8: (\xf9)

          An error will be raised if the entire string isn't contained in
          quotes properly:

          >>> parser._parseQuotedString(u'abc')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: String is not quoted
          >>> parser._parseQuotedString(u'')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: String is not quoted
          >>> parser._parseQuotedString(u'\"ab')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: String not terminated
          >>> parser._parseQuotedString(u'\"ab\"x')
          Traceback (most recent call last):
            ...
          TranslationFormatSyntaxError: Extra content found after string: (x)
        """
        if self._escaped_line_break:
            # Continuing a line after an escaped newline.  Strip indentation.
            string = string.lstrip()
            self._escaped_line_break = False
        elif string.startswith('"'):
            # Regular string.  Strip the opening quote.
            string = string[1:]
        else:
            # Covers both a missing opening quote and an empty string;
            # startswith() avoids an IndexError on the latter.
            raise TranslationFormatSyntaxError(
                line_number=self._lineno, message="String is not quoted")

        output = ''
        while len(string) > 0:
            if string[0] == '"':
                # Reached the end of the quoted string.  It's rare, but there
                # may be another quoted string on the same line.  It should be
                # suffixed to what we already have, with any whitespace
                # between the strings removed.
                string = string[1:].lstrip()
                if len(string) == 0:
                    # End of line, end of string: the normal case
                    break
                if string[0] == '"':
                    # Start of a new string.  We've already swallowed the
                    # closing quote and any intervening whitespace; now
                    # swallow the re-opening quote and go on as if the string
                    # just went on normally
                    string = string[1:]
                    continue

                # Whatever is left is non-empty and, because of the lstrip()
                # above, starts with a non-whitespace character that is not
                # a quote: it is stray content after the string.
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message=("Extra content found after string: (%s)" %
                             string))
            elif string[0] == '\\':
                if len(string) == 1:
                    # A lone backslash at the end of the line escapes the
                    # line break; the string continues on the next line.
                    self._escaped_line_break = True
                    string = ''
                    break
                elif string[1] in ESCAPE_MAP:
                    # We got one of the special escaped chars we know about.
                    # Unescape it using the mapping table.
                    output += ESCAPE_MAP[string[1]]
                    string = string[2:]
                else:
                    # Anything else is handed to the numeric escape
                    # decoder, which returns the decoded text and the
                    # unparsed remainder.
                    unescaped, string = (
                        self._unescapeNumericCharSequence(string))
                    output += unescaped
            else:
                # Normal text.  Eat up as much as we can in one go.
                text = re.match(STRAIGHT_TEXT_RUN, string)
                output += text.group()
                string = string[text.end():]
        else:
            # The loop ran out of input without hitting a "break": we
            # finished parsing the string without finding the ending quote
            # char.
            raise TranslationFormatSyntaxError(
                line_number=self._lineno, message="String not terminated")

        return output

    def _dumpCurrentSection(self):
        """Store the text collected for the current section in the message.

        Does nothing when no section is open.  Otherwise the accumulated
        `self._parsed_content` is written to the part of `self._message`
        that matches `self._section`, and the accumulator is reset.

        :raise AssertionError: if `self._section` is not a known section.
        """
        section = self._section
        if section is None:
            # No section is open, so there is nothing to store.
            return

        # Sections whose content is stored in a plain message attribute.
        attribute_for_section = {
            'msgctxt': 'context',
            'msgid': 'msgid_singular',
            'msgid_plural': 'msgid_plural',
        }

        if section in attribute_for_section:
            setattr(
                self._message, attribute_for_section[section],
                self._parsed_content)
            if section == 'msgid_plural':
                # Note in the header that there are plural forms.
                self._translation_file.header.has_plural_forms = True
        elif section == 'msgstr':
            if self._message.msgid_plural is None:
                # No plural msgid: use the plural case number as it is.
                form = self._plural_case
            else:
                # Plural message: translate the case number through the
                # plural form mapping first.
                form = self._plural_form_mapping[self._plural_case]
            self._message.addTranslation(form, self._parsed_content)
        else:
            raise AssertionError('Unknown section %s' % section)

        self._parsed_content = u''

    def _parseFreshLine(self, line, original_line):
        """Parse a new line (not a continuation after escaped newline).

        Recognizes obsolete-message markers ("#~"), comment lines (flags,
        file references, source comments and plain comments) and the
        section keywords msgctxt, msgid, msgid_plural and msgstr.  Parser
        state (`_message`, `_section`, `_plural_case`, `_parsed_content`)
        is updated accordingly; a comment, msgid or msgctxt line that
        follows a msgstr section closes the message in progress and starts
        a new one.

        :param line: Remaining part of input line.
        :param original_line: Line as it originally was on input.
        :return: If there is one, the first line of a quoted string belonging
            to the line's section (an empty string, with a syntax warning,
            if the keyword is not followed by anything).  None for comment
            lines and obsolete-message lines without content.
        :raise TranslationFormatSyntaxError: on a keyword that is out of
            place, an invalid or unsupported plural case number, or
            content that does not belong to any section.
        """
        is_obsolete = False
        if line.startswith('#~'):
            if line.startswith('#~|'):
                # This is an old msgid for an obsolete message.
                return None
            else:
                is_obsolete = True
                line = line[2:].lstrip()
                if len(line) == 0:
                    return None

        # If we get a comment line after a msgstr or a line starting with
        # msgid or msgctxt, this is a new entry.
        if (line.startswith(('#', 'msgid', 'msgctxt')) and
            self._section == 'msgstr'):
            if self._message is None:
                # first entry - do nothing.
                pass
            elif self._message.msgid_singular:
                self._dumpCurrentSection()
                self._storeCurrentMessage()
            elif self._translation_file.header is None:
                # When there is no msgid in the parsed message, it's the
                # header for this file.
                self._dumpCurrentSection()
                self._parseHeader(
                    self._message.translations[
                        TranslationConstants.SINGULAR_FORM],
                    self._message.comment)
            else:
                self._emitSyntaxWarning("We got a second header.")

            # Start a new message.
            self._message = TranslationMessageData()
            self._message_lineno = self._lineno
            self._section = None
            self._plural_case = None
            self._parsed_content = u''

        if self._message is not None:
            # Record whether the message is obsolete.
            self._message.is_obsolete = is_obsolete

        if line[0] == '#':
            # Record flags
            if line[:2] == '#,':
                new_flags = [flag.strip() for flag in line[2:].split(',')]
                self._message.flags.update(new_flags)
                return None
            # Record file references
            if line[:2] == '#:':
                if self._message.file_references:
                    # There is already a file reference, let's split it from
                    # the new one with a new line char.
                    self._message.file_references += '\n'
                self._message.file_references += line[2:].strip()
                return None
            # Record source comments
            if line[:2] == '#.':
                self._message.source_comment += line[2:].strip() + '\n'
                return None
            # Record comments
            self._message.comment += line[1:] + '\n'
            return None

        # Now we are in a msgctxt or msgid section, output previous section.
        # msgid_plural must be tested before msgid, which is its prefix.
        if line.startswith('msgid_plural'):
            if self._section != 'msgid':
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid_plural")
            self._dumpCurrentSection()
            self._section = 'msgid_plural'
            line = line[len('msgid_plural'):]
        elif line.startswith('msgctxt'):
            if (self._section is not None and
                (self._section == 'msgctxt' or
                 self._section.startswith('msgid'))):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgctxt")
            self._section = 'msgctxt'
            line = line[len('msgctxt'):]
        elif line.startswith('msgid'):
            if (self._section is not None and
                self._section.startswith('msgid')):
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unexpected keyword: msgid")
            if self._section is not None:
                self._dumpCurrentSection()
            self._section = 'msgid'
            line = line[len('msgid'):]
            self._plural_case = None
        # Now we are in a msgstr section
        elif line.startswith('msgstr'):
            self._dumpCurrentSection()
            self._section = 'msgstr'
            line = line[len('msgstr'):]
            # XXX kiko 2005-08-19: if line is empty, it means we got an msgstr
            # followed by a newline; that may be critical, but who knows?
            if line.startswith('['):
                # Plural case.  partition() always yields three parts, so
                # a missing "]" cannot break the unpacking.
                plural_text, closing_bracket, line = (
                    line[1:].partition(']'))

                if closing_bracket:
                    try:
                        new_plural_case = int(plural_text)
                    except ValueError:
                        # Trigger "invalid plural case number" error.
                        new_plural_case = -1
                else:
                    # "msgstr[" without a closing bracket: there is no
                    # usable index, so report it as an invalid plural case
                    # number instead of failing with a bare ValueError.
                    new_plural_case = -1

                if new_plural_case < 0:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Invalid plural case number.")
                elif new_plural_case >= TranslationConstants.MAX_PLURAL_FORMS:
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message="Unsupported plural case number.")

                # Plural cases are expected in consecutive, increasing
                # order; anything else only earns a warning.
                if (self._plural_case is not None) and (
                        new_plural_case != self._plural_case + 1):
                    self._emitSyntaxWarning("Bad plural case number.")
                if new_plural_case != self._plural_case:
                    self._plural_case = new_plural_case
                else:
                    self._emitSyntaxWarning(
                        "msgstr[] repeats same plural case number.")
            else:
                self._plural_case = TranslationConstants.SINGULAR_FORM
        elif self._section is None:
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Invalid content: %r' % original_line)
        else:
            # This line could be the continuation of a previous section.
            pass

        line = line.strip()
        if len(line) == 0:
            self._emitSyntaxWarning(
                "Line has no content; this is not supported by some "
                "implementations of msgfmt.")
        return line

    def _parseLine(self, original_line):
        """Parse one input line and append its text to the current section.

        The line counter is advanced for every line, including blank lines,
        which are otherwise ignored.

        :param original_line: The line as read from the input.
        :raise TranslationFormatSyntaxError: if the line carries quoted
            text while no text section is open.
        """
        self._lineno += 1

        stripped = original_line.strip()
        if not stripped:
            # Skip empty lines.
            return

        if self._escaped_line_break:
            # Continuation of a quoted string whose previous line ended in
            # an escaped line break: the whole line is string content.
            quoted = stripped
        else:
            quoted = self._parseFreshLine(stripped, original_line)
            if not quoted:
                # Nothing (None or an empty string) left to parse as text.
                return

        text = self._parseQuotedString(quoted)

        if self._section not in (
                'msgctxt', 'msgid', 'msgid_plural', 'msgstr'):
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message='Invalid content: %r' % original_line)

        self._parsed_content += text
    def testIsIdenticalTranslation(self):
        """Test `is_identical_translation`.

        Two messages are built up side by side.  After each change to one
        of them the comparison is expected to fail, and after the matching
        change to the other it is expected to succeed again.
        """
        msg1 = TranslationMessageData()
        msg2 = TranslationMessageData()
        msg1.msgid_singular = "foo"
        msg2.msgid_singular = "foo"

        # Same singular msgid and nothing else set: identical.
        self.assertTrue(is_identical_translation(msg1, msg2),
            "Two blank translation messages do not evaluate as identical.")

        # A plural msgid on only one side, or differing plural msgids,
        # make the messages differ.
        msg1.msgid_plural = "foos"
        self.assertFalse(is_identical_translation(msg1, msg2),
            "Message with fewer plural forms is accepted as identical.")
        msg2.msgid_plural = "splat"
        self.assertFalse(is_identical_translation(msg1, msg2),
            "Messages with different plurals accepted as identical.")
        msg2.msgid_plural = "foos"
        self.assertTrue(is_identical_translation(msg1, msg2),
            "Messages with identical plural forms not accepted as identical.")

        # The translations are set directly through the private
        # `_translations` list.  A translated message differs from an
        # untranslated one.
        msg1._translations = ["le foo"]
        self.assertFalse(is_identical_translation(msg1, msg2),
            "Failed to distinguish translated message from untranslated one.")
        msg2._translations = ["le foo"]
        self.assertTrue(is_identical_translation(msg1, msg2),
            "Identical translations not accepted as identical.")

        # A plural translation present on only one side is a difference.
        msg1._translations = ["le foo", "les foos"]
        self.assertFalse(is_identical_translation(msg1, msg2),
            "Failed to distinguish message with missing plural translation.")
        msg2._translations = ["le foo", "les foos"]
        self.assertTrue(is_identical_translation(msg1, msg2),
            "Identical plural translations not accepted as equal.")

        # A third form on one side is a difference.  Once the other side
        # has the same three forms, its extra trailing None entry is
        # expected not to matter.
        msg1._translations = ["le foo", "les foos", "beaucoup des foos"]
        self.assertFalse(is_identical_translation(msg1, msg2),
            "Failed to distinguish message with extra plural translations.")
        msg2._translations = ["le foo", "les foos", "beaucoup des foos", None]
        self.assertTrue(is_identical_translation(msg1, msg2),
            "Identical multi-form messages not accepted as identical.")