Exemple #1
0
def get_units(events, srx_handler=None):
    """Yield the translatable units found in an event stream.

    The events are first passed through ``_get_translatable_blocks``, which
    is iterated as ``(type, value, line)`` triples.  Three kinds of block
    are handled:

    - ``START_ELEMENT``: every attribute whose datatype is a subclass of
      ``Unicode`` and whose value is not blank is yielded as
      ``(unit, context, line)``, where ``unit`` is the one-item tuple
      ``((srx_TEXT, value),)``.
    - ``END_ELEMENT``: only used to track when a "keep spaces" element is
      closed.
    - ``MESSAGE``: whatever ``get_segments`` produces for the message is
      yielded unchanged.

    ``srx_handler`` is handed over to ``get_segments`` untouched.
    """
    # The flag and the counter are kept as two separate pieces of state:
    # the flag is raised on every "keep spaces" opening tag and is only
    # lowered when the counter comes back to exactly zero.
    preserve_spaces = False
    preserve_depth = 0
    for kind, payload, line in _get_translatable_blocks(events):
        if kind == START_ELEMENT:
            tag_uri, tag_name, attributes = payload
            # Translatable attributes
            for attr_uri, attr_name in attributes:
                datatype = get_attr_datatype(tag_uri, tag_name, attr_uri,
                                             attr_name, attributes)
                if issubclass(datatype, Unicode):
                    text = attributes[(attr_uri, attr_name)]
                    # Blank attribute values are not worth translating
                    if text.strip():
                        unit = ((srx_TEXT, text),)
                        context = _get_attr_context(datatype, tag_name,
                                                    attr_name)
                        yield (unit, context, line)
            # Entering an element that keeps its spaces?
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                preserve_spaces = True
                preserve_depth += 1
        elif kind == END_ELEMENT:
            tag_uri, tag_name = payload
            # Leaving an element that keeps its spaces?
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                preserve_depth -= 1
                if preserve_depth == 0:
                    preserve_spaces = False
        elif kind == MESSAGE:
            # Segmentation
            for segment in get_segments(payload, preserve_spaces,
                                        srx_handler):
                yield segment
Exemple #2
0
def get_units(events, srx_handler=None):
    """Yield the translatable units found in an event stream.

    The events are first passed through ``_get_translatable_blocks``, which
    is iterated as ``(type, value, line)`` triples.  Three kinds of block
    are handled:

    - ``START_ELEMENT``: every attribute whose datatype is a subclass of
      ``Unicode`` and whose value is not blank is yielded as
      ``(unit, context, line)``, where ``unit`` is the one-item tuple
      ``((srx_TEXT, value),)``.
    - ``END_ELEMENT``: only used to track when a "keep spaces" element is
      closed.
    - ``MESSAGE``: whatever ``get_segments`` produces for the message is
      yielded unchanged.

    ``srx_handler`` is handed over to ``get_segments`` untouched.
    """
    # The flag and the counter are kept as two separate pieces of state:
    # the flag is raised on every "keep spaces" opening tag and is only
    # lowered when the counter comes back to exactly zero.
    preserve_spaces = False
    preserve_depth = 0
    for kind, payload, line in _get_translatable_blocks(events):
        if kind == START_ELEMENT:
            tag_uri, tag_name, attributes = payload
            # Translatable attributes
            for attr_uri, attr_name in attributes:
                datatype = get_attr_datatype(tag_uri, tag_name, attr_uri,
                                             attr_name, attributes)
                if issubclass(datatype, Unicode):
                    text = attributes[(attr_uri, attr_name)]
                    # Blank attribute values are not worth translating
                    if text.strip():
                        unit = ((srx_TEXT, text),)
                        context = _get_attr_context(datatype, tag_name,
                                                    attr_name)
                        yield (unit, context, line)
            # Entering an element that keeps its spaces?
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                preserve_spaces = True
                preserve_depth += 1
        elif kind == END_ELEMENT:
            tag_uri, tag_name = payload
            # Leaving an element that keeps its spaces?
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                preserve_depth -= 1
                if preserve_depth == 0:
                    preserve_spaces = False
        elif kind == MESSAGE:
            # Segmentation
            for segment in get_segments(payload, preserve_spaces,
                                        srx_handler):
                yield segment
Exemple #3
0
    def test_word(self):
        # A lone sentence followed by a space is expected to come back as
        # a single segment, without that trailing space.
        message = Message()
        message.append_text('Hello. ')

        expected = [((TEXT, u'Hello.'),)]
        segments = [seg for seg, context, offset in get_segments(message)]
        self.assertEqual(segments, expected)
Exemple #4
0
    def test_word(self):
        # A lone sentence followed by a space is expected to come back as
        # a single segment, without that trailing space.
        message = Message()
        message.append_text('Hello. ')

        expected = [((TEXT, u'Hello.'),)]
        segments = [seg for seg, context, offset in get_segments(message)]
        self.assertEqual(segments, expected)
Exemple #5
0
 def test_abrevations(self):
     # Each case pairs an input text with the segments expected for it.
     # In both cases the period after "Inc." / "Mr." is expected not to
     # end the segment, so the whole text comes back in one piece.
     cases = [
         # 1
         (u'This is Toto Inc. a big company.',
          [((TEXT, u'This is Toto Inc. a big company.'),)]),
         # 2
         (u'Mr. From',
          [((TEXT, u'Mr. From'),)]),
     ]
     for text, result in cases:
         message = Message()
         message.append_text(text)
         segments = [seg for seg, context, offset in get_segments(message)]
         self.assertEqual(segments, result)
Exemple #6
0
 def test_abrevations(self):
     # Each case pairs an input text with the segments expected for it.
     # In both cases the period after "Inc." / "Mr." is expected not to
     # end the segment, so the whole text comes back in one piece.
     cases = [
         # 1
         (u'This is Toto Inc. a big company.',
          [((TEXT, u'This is Toto Inc. a big company.'),)]),
         # 2
         (u'Mr. From',
          [((TEXT, u'Mr. From'),)]),
     ]
     for text, result in cases:
         message = Message()
         message.append_text(text)
         segments = [seg for seg, context, offset in get_segments(message)]
         self.assertEqual(segments, result)
Exemple #7
0
    def test_tab(self):
        # The newlines, tabs and spaces surrounding the sentence are
        # expected to be dropped from the resulting segment.
        source = '\n\t   This folder is empty.\n\t   '
        expected = [((TEXT, u'This folder is empty.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #8
0
    def test_number(self):
        # Periods and commas inside numbers are expected not to split the
        # text: the whole sentence comes back as one segment.
        source = u'The 12.54 and 12,54 and 152.'
        expected = [((TEXT, u'The 12.54 and 12,54 and 152.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #9
0
    def test_unknown_abrevations(self):
        # The periods inside "E.T." are expected not to end the segment.
        source = u'E.T. is beautiful.'
        expected = [((TEXT, u'E.T. is beautiful.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #10
0
    def test_simple(self):
        # Two plain sentences are expected to give two segments, without
        # the space that separates them.
        source = u'This is a sentence. A very little sentence.'
        expected = [
            ((TEXT, u'This is a sentence.'),),
            ((TEXT, u'A very little sentence.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]
        self.assertEqual(segments, expected)
Exemple #11
0
    def test_tab(self):
        # The newlines, tabs and spaces surrounding the sentence are
        # expected to be dropped from the resulting segment.
        source = '\n\t   This folder is empty.\n\t   '
        expected = [((TEXT, u'This folder is empty.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #12
0
    def test_between_number(self):
        # The text is expected to split after the colon, while the
        # decimal number stays intact inside the second segment.
        source = u'Price: -12.25 Euro.'
        expected = [
            ((TEXT, u'Price:'),),
            ((TEXT, u'-12.25 Euro.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #13
0
    def test_single_character(self):
        # The period after the single letter "T" is expected not to end
        # the segment.
        source = u'I am T. From.'
        expected = [((TEXT, u'I am T. From.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #14
0
    def test_punctuation(self):
        # The run of spaces is expected to collapse to a single space, and
        # neither "Ph.D" nor the closing "?!!!!" should split the text.
        source = u'A Ph.D in          mathematics?!!!!'
        expected = [((TEXT, u'A Ph.D in mathematics?!!!!'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #15
0
    def test_etc(self):
        # The text is expected to split right after the ellipsis.
        source = u'A lot of animals... And no man'
        expected = [
            ((TEXT, u'A lot of animals...'),),
            ((TEXT, u'And no man'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #16
0
    def test_number(self):
        # Periods and commas inside numbers are expected not to split the
        # text: the whole sentence comes back as one segment.
        source = u'The 12.54 and 12,54 and 152.'
        expected = [((TEXT, u'The 12.54 and 12,54 and 152.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #17
0
    def test_etc(self):
        # The text is expected to split right after the ellipsis.
        source = u'A lot of animals... And no man'
        expected = [
            ((TEXT, u'A lot of animals...'),),
            ((TEXT, u'And no man'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #18
0
    def test_between_number(self):
        # The text is expected to split after the colon, while the
        # decimal number stays intact inside the second segment.
        source = u'Price: -12.25 Euro.'
        expected = [
            ((TEXT, u'Price:'),),
            ((TEXT, u'-12.25 Euro.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #19
0
    def test_unknown_abrevations(self):
        # The periods inside "E.T." are expected not to end the segment.
        source = u'E.T. is beautiful.'
        expected = [((TEXT, u'E.T. is beautiful.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #20
0
    def test_single_character(self):
        # The period after the single letter "T" is expected not to end
        # the segment.
        source = u'I am T. From.'
        expected = [((TEXT, u'I am T. From.'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #21
0
    def test_simple(self):
        # Two plain sentences are expected to give two segments, without
        # the space that separates them.
        source = u'This is a sentence. A very little sentence.'
        expected = [
            ((TEXT, u'This is a sentence.'),),
            ((TEXT, u'A very little sentence.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]
        self.assertEqual(segments, expected)
Exemple #22
0
    def test_punctuation(self):
        # The run of spaces is expected to collapse to a single space, and
        # neither "Ph.D" nor the closing "?!!!!" should split the text.
        source = u'A Ph.D in          mathematics?!!!!'
        expected = [((TEXT, u'A Ph.D in mathematics?!!!!'),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #23
0
    def test_semicolon(self):
        # The text is expected to split right after the semicolon.
        source = ('Write to the Free Software Foundation; we sometimes make'
                  ' exceptions for this.')
        expected = [
            ((TEXT, u'Write to the Free Software Foundation;'),),
            ((TEXT, u'we sometimes make exceptions for this.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #24
0
    def test_semicolon(self):
        # The text is expected to split right after the semicolon.
        source = ('Write to the Free Software Foundation; we sometimes make'
                  ' exceptions for this.')
        expected = [
            ((TEXT, u'Write to the Free Software Foundation;'),),
            ((TEXT, u'we sometimes make exceptions for this.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #25
0
    def test_newline(self):
        # The newline in the middle of the sentence is expected to become
        # a single space, and the trailing newline to be dropped.
        source = ('And you must show them these terms so they know their\n'
                  'rights.\n')
        sentence = (
            u'And you must show them these terms so they know their rights.')
        expected = [((TEXT, sentence),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #26
0
    def test_newline(self):
        # The newline in the middle of the sentence is expected to become
        # a single space, and the trailing newline to be dropped.
        source = ('And you must show them these terms so they know their\n'
                  'rights.\n')
        sentence = (
            u'And you must show them these terms so they know their rights.')
        expected = [((TEXT, sentence),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #27
0
    def test_parentheses2(self):
        # The sentence closed by '.)' is expected not to split the text:
        # everything comes back as one segment, with the two spaces after
        # the parenthesis reduced to one.
        source = ('(Hereinafter, translation is included without limitation'
                  ' in the term "modification".)  Each licensee is addressed'
                  ' as "you".')
        sentence = (u'(Hereinafter, translation is included without '
                    u'limitation in the term "modification".) Each '
                    u'licensee is addressed as "you".')
        expected = [((TEXT, sentence),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #28
0
    def test_raw_text(self):
        # With keep_spaces=True the text is still expected to be cut into
        # sentences, but the runs of spaces and the newline found inside
        # a segment must come back untouched.
        source = (u'This is raw text. Every characters must be kept. '
                  u'1 space 2 spaces  3 spaces   1 newline\nend.')
        wanted = [
            ((TEXT, u'This is raw text.'),),
            ((TEXT, u'Every characters must be kept.'),),
            ((TEXT, u'1 space 2 spaces  3 spaces   1 newline\nend.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset
                    in get_segments(message, keep_spaces=True)]

        self.assertEqual(segments, wanted)
Exemple #29
0
    def test_raw_text(self):
        # With keep_spaces=True the text is still expected to be cut into
        # sentences, but the runs of spaces and the newline found inside
        # a segment must come back untouched.
        source = (u'This is raw text. Every characters must be kept. '
                  u'1 space 2 spaces  3 spaces   1 newline\nend.')
        wanted = [
            ((TEXT, u'This is raw text.'),),
            ((TEXT, u'Every characters must be kept.'),),
            ((TEXT, u'1 space 2 spaces  3 spaces   1 newline\nend.'),),
        ]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset
                    in get_segments(message, keep_spaces=True)]

        self.assertEqual(segments, wanted)
Exemple #30
0
    def test_parentheses2(self):
        # The sentence closed by '.)' is expected not to split the text:
        # everything comes back as one segment, with the two spaces after
        # the parenthesis reduced to one.
        source = ('(Hereinafter, translation is included without limitation'
                  ' in the term "modification".)  Each licensee is addressed'
                  ' as "you".')
        sentence = (u'(Hereinafter, translation is included without '
                    u'limitation in the term "modification".) Each '
                    u'licensee is addressed as "you".')
        expected = [((TEXT, sentence),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #31
0
    def test_parentheses1(self):
        # The text is expected to split after the colon that follows
        # "(Exception", and the trailing spaces to be dropped.
        source = ('(Exception: if the Program itself is interactive but '
                  'does not'
                  ' normally print such an announcement, your work based on '
                  'the'
                  ' Program is not required to print an announcement.)  ')
        opening = u'(Exception:'
        remainder = (u'if the Program itself is interactive but does '
                     u'not normally print such an announcement, your '
                     u'work based on the Program is not required to '
                     u'print an announcement.)')
        expected = [((TEXT, opening),), ((TEXT, remainder),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)
Exemple #32
0
    def test_parentheses1(self):
        # The text is expected to split after the colon that follows
        # "(Exception", and the trailing spaces to be dropped.
        source = ('(Exception: if the Program itself is interactive but '
                  'does not'
                  ' normally print such an announcement, your work based on '
                  'the'
                  ' Program is not required to print an announcement.)  ')
        opening = u'(Exception:'
        remainder = (u'if the Program itself is interactive but does '
                     u'not normally print such an announcement, your '
                     u'work based on the Program is not required to '
                     u'print an announcement.)')
        expected = [((TEXT, opening),), ((TEXT, remainder),)]

        message = Message()
        message.append_text(source)
        segments = [seg for seg, context, offset in get_segments(message)]

        self.assertEqual(segments, expected)