def get_units(events, srx_handler=None):
    """Generate the units extracted from a stream of XML events.

    The events are first passed through ``_get_translatable_blocks``; each
    resulting ``(kind, payload, line)`` triple is handled according to its
    kind:

    * ``START_ELEMENT``: every attribute whose datatype is a subclass of
      ``Unicode`` and whose value is not blank is yielded as
      ``(((srx_TEXT, value),), context, line)``, where ``context`` comes
      from ``_get_attr_context``.
    * ``END_ELEMENT``: only used to track the "keep spaces" state.
    * ``MESSAGE``: the payload is handed to ``get_segments`` and every item
      it produces is yielded unchanged.

    ``srx_handler`` is forwarded as-is to ``get_segments``.
    """
    # "keep_spaces" is switched on when an element whose schema sets
    # keep_spaces is opened, and switched off again when the nesting
    # counter drops back to zero.
    keep_spaces = False
    keep_spaces_level = 0

    for kind, payload, line in _get_translatable_blocks(events):
        if kind == START_ELEMENT:
            tag_uri, tag_name, attributes = payload

            # Attribute values
            for attr_uri, attr_name in attributes:
                datatype = get_attr_datatype(tag_uri, tag_name, attr_uri,
                                             attr_name, attributes)
                if issubclass(datatype, Unicode):
                    text = attributes[(attr_uri, attr_name)]
                    if text.strip():
                        unit = ((srx_TEXT, text),)
                        context = _get_attr_context(datatype, tag_name,
                                                    attr_name)
                        yield (unit, context, line)

            # Entering an element that preserves whitespace?
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                keep_spaces = True
                keep_spaces_level += 1
            continue

        if kind == END_ELEMENT:
            # Leaving an element that preserves whitespace?
            tag_uri, tag_name = payload
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                keep_spaces_level -= 1
                if keep_spaces_level == 0:
                    keep_spaces = False
            continue

        if kind == MESSAGE:
            # Delegate the segmentation of the message
            for segment in get_segments(payload, keep_spaces, srx_handler):
                yield segment
def get_units(events, srx_handler=None):
    """Generate the units extracted from a stream of XML events.

    The events are first passed through ``_get_translatable_blocks``; each
    resulting ``(kind, payload, line)`` triple is handled according to its
    kind:

    * ``START_ELEMENT``: every attribute whose datatype is a subclass of
      ``Unicode`` and whose value is not blank is yielded as
      ``(((srx_TEXT, value),), context, line)``, where ``context`` comes
      from ``_get_attr_context``.
    * ``END_ELEMENT``: only used to track the "keep spaces" state.
    * ``MESSAGE``: the payload is handed to ``get_segments`` and every item
      it produces is yielded unchanged.

    ``srx_handler`` is forwarded as-is to ``get_segments``.
    """
    # "keep_spaces" is switched on when an element whose schema sets
    # keep_spaces is opened, and switched off again when the nesting
    # counter drops back to zero.
    keep_spaces = False
    keep_spaces_level = 0

    for kind, payload, line in _get_translatable_blocks(events):
        if kind == START_ELEMENT:
            tag_uri, tag_name, attributes = payload

            # Attribute values
            for attr_uri, attr_name in attributes:
                datatype = get_attr_datatype(tag_uri, tag_name, attr_uri,
                                             attr_name, attributes)
                if issubclass(datatype, Unicode):
                    text = attributes[(attr_uri, attr_name)]
                    if text.strip():
                        unit = ((srx_TEXT, text),)
                        context = _get_attr_context(datatype, tag_name,
                                                    attr_name)
                        yield (unit, context, line)

            # Entering an element that preserves whitespace?
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                keep_spaces = True
                keep_spaces_level += 1
            continue

        if kind == END_ELEMENT:
            # Leaving an element that preserves whitespace?
            tag_uri, tag_name = payload
            if get_element_schema(tag_uri, tag_name).keep_spaces:
                keep_spaces_level -= 1
                if keep_spaces_level == 0:
                    keep_spaces = False
            continue

        if kind == MESSAGE:
            # Delegate the segmentation of the message
            for segment in get_segments(payload, keep_spaces, srx_handler):
                yield segment
def test_word(self):
    # A lone word with a trailing space gives a single segment, and the
    # trailing space is not part of it.
    msg = Message()
    msg.append_text('Hello. ')

    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, [((TEXT, u'Hello.'),)])
def test_abrevations(self):
    # The period of an abbreviation ("Inc.", "Mr.") must not end the
    # segment: each sample is expected back as one unchanged segment.
    samples = (
        u'This is Toto Inc. a big company.',  # 1
        u'Mr. From',                          # 2
    )
    for sample in samples:
        wanted = [((TEXT, sample),)]
        msg = Message()
        msg.append_text(sample)
        found = [unit for unit, context, offset in get_segments(msg)]
        self.assertEqual(found, wanted)
def test_tab(self):
    # Newlines, tabs and spaces surrounding the sentence are expected to
    # be absent from the resulting segment.
    raw = '\n\t This folder is empty.\n\t '
    wanted = [((TEXT, u'This folder is empty.'),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_number(self):
    # Dots and commas inside numbers do not split the sentence.
    raw = u'The 12.54 and 12,54 and 152.'
    wanted = [((TEXT, u'The 12.54 and 12,54 and 152.'),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_unknown_abrevations(self):
    # "E.T." is expected to stay inside a single segment.
    raw = u'E.T. is beautiful.'
    wanted = [((TEXT, u'E.T. is beautiful.'),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_simple(self):
    # Two plain sentences are expected as two separate segments.
    raw = u'This is a sentence. A very little sentence.'
    wanted = [
        ((TEXT, u'This is a sentence.'),),
        ((TEXT, u'A very little sentence.'),),
    ]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_between_number(self):
    # The colon closes the first segment; the decimal point inside the
    # negative number does not open a new one.
    raw = u'Price: -12.25 Euro.'
    wanted = [
        ((TEXT, u'Price:'),),
        ((TEXT, u'-12.25 Euro.'),),
    ]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_single_character(self):
    # A single letter followed by a period ("T.") does not end the segment.
    raw = u'I am T. From.'
    wanted = [((TEXT, u'I am T. From.'),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_punctuation(self):
    # "Ph.D" and the trailing run of "?!!!!" are expected to remain in one
    # segment.
    raw = u'A Ph.D in mathematics?!!!!'
    wanted = [((TEXT, u'A Ph.D in mathematics?!!!!'),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_etc(self):
    # An ellipsis closes a segment; the text after it forms the next one.
    raw = u'A lot of animals... And no man'
    wanted = [
        ((TEXT, u'A lot of animals...'),),
        ((TEXT, u'And no man'),),
    ]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_semicolon(self):
    # A semicolon is expected to close a segment.
    raw = ('Write to the Free Software Foundation; we sometimes make'
           ' exceptions for this.')
    wanted = [
        ((TEXT, u'Write to the Free Software Foundation;'),),
        ((TEXT, u'we sometimes make exceptions for this.'),),
    ]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_newline(self):
    # The newline inside the sentence is expected to come back as a single
    # space, and the final newline to be dropped.
    raw = ('And you must show them these terms so they know their\n'
           'rights.\n')
    wanted = [((TEXT, u'And you must show them these terms so they know their rights.'),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_parentheses2(self):
    # A period directly followed by a closing parenthesis is not expected
    # to split the text: the whole input comes back as one segment.
    raw = ('(Hereinafter, translation is included without limitation'
           ' in the term "modification".) Each licensee is addressed'
           ' as "you".')
    whole = (u'(Hereinafter, translation is included without '
             u'limitation in the term "modification".) Each '
             u'licensee is addressed as "you".')
    wanted = [((TEXT, whole),)]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)
def test_raw_text(self):
    # With keep_spaces=True the text is still cut into sentences, and the
    # newline inside the last one is expected to be kept verbatim.
    # NOTE(review): the wording mentions "2 spaces" / "3 spaces" but the
    # literals below contain single spaces -- confirm the fixture has not
    # lost whitespace.
    raw = (u'This is raw text. Every characters must be kept. '
           u'1 space 2 spaces 3 spaces 1 newline\nend.')
    wanted = [
        ((TEXT, u'This is raw text.'),),
        ((TEXT, u'Every characters must be kept.'),),
        ((TEXT, u'1 space 2 spaces 3 spaces 1 newline\nend.'),),
    ]

    msg = Message()
    msg.append_text(raw)
    found = [unit
             for unit, context, offset in get_segments(msg, keep_spaces=True)]

    self.assertEqual(found, wanted)
def test_parentheses1(self):
    # The colon after "(Exception" closes the first segment; the rest of
    # the parenthesised sentence forms the second one, without the
    # trailing space of the input.
    raw = ('(Exception: if the Program itself is interactive but does not'
           ' normally print such an announcement, your work based on the'
           ' Program is not required to print an announcement.) ')
    remainder = (u'if the Program itself is interactive but does '
                 u'not normally print such an announcement, your '
                 u'work based on the Program is not required to '
                 u'print an announcement.)')
    wanted = [
        ((TEXT, u'(Exception:'),),
        ((TEXT, remainder),),
    ]

    msg = Message()
    msg.append_text(raw)
    found = [unit for unit, context, offset in get_segments(msg)]

    self.assertEqual(found, wanted)