def test_word(self):
    # A single word with a trailing space must come back as one segment
    # without that trailing space.
    message = Message()
    message.append_text('Hello. ')

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, [((TEXT, u'Hello.'),)])
def test_etc(self):
    # An ellipsis followed by a capitalised word splits into two segments.
    source = u'A lot of animals... And no man'
    expected = [
        ((TEXT, u'A lot of animals...'),),
        ((TEXT, u'And no man'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_number(self):
    # Dots and commas inside numbers must not split the sentence.
    source = u'The 12.54 and 12,54 and 152.'
    expected = [((TEXT, u'The 12.54 and 12,54 and 152.'),)]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_punctuation(self):
    # "Ph.D" and a run of closing punctuation stay in a single segment.
    source = u'A Ph.D in mathematics?!!!!'
    expected = [((TEXT, u'A Ph.D in mathematics?!!!!'),)]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_single_character(self):
    # A single letter followed by a dot ("T.") does not end the sentence.
    source = u'I am T. From.'
    expected = [((TEXT, u'I am T. From.'),)]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_tab(self):
    # Surrounding newlines, tabs and spaces are dropped from the segment.
    source = '\n\t This folder is empty.\n\t '
    expected = [((TEXT, u'This folder is empty.'),)]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_between_number(self):
    # The colon splits the text; the signed decimal number stays intact.
    source = u'Price: -12.25 Euro.'
    expected = [
        ((TEXT, u'Price:'),),
        ((TEXT, u'-12.25 Euro.'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_unknown_abrevations(self):
    # "E.T." is expected to be kept inside a single segment.
    source = u'E.T. is beautiful.'
    expected = [((TEXT, u'E.T. is beautiful.'),)]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_simple(self):
    # Two plain sentences give two segments.
    source = u'This is a sentence. A very little sentence.'
    expected = [
        ((TEXT, u'This is a sentence.'),),
        ((TEXT, u'A very little sentence.'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_newline(self):
    # A newline in the middle of a sentence is expected to come back as a
    # single space, and the trailing newline is dropped.
    source = ('And you must show them these terms so they know their\n'
              'rights.\n')
    expected = [
        ((TEXT, u'And you must show them these terms so they know their '
                u'rights.'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_semicolon(self):
    # A semicolon ends a segment.
    source = ('Write to the Free Software Foundation; we sometimes make'
              ' exceptions for this.')
    expected = [
        ((TEXT, u'Write to the Free Software Foundation;'),),
        ((TEXT, u'we sometimes make exceptions for this.'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_parentheses2(self):
    # A sentence closed inside parentheses followed by another sentence is
    # expected to stay in one segment.
    source = ('(Hereinafter, translation is included without limitation'
              ' in the term "modification".) Each licensee is addressed'
              ' as "you".')
    expected = [
        ((TEXT, u'(Hereinafter, translation is included without '
                u'limitation in the term "modification".) Each '
                u'licensee is addressed as "you".'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_raw_text(self):
    # With keep_spaces=True the text is still split into sentences, and the
    # newline inside the last sentence is kept as is.
    source = (u'This is raw text. Every characters must be kept. '
              u'1 space 2 spaces 3 spaces 1 newline\nend.')
    wanted = [
        ((TEXT, u'This is raw text.'),),
        ((TEXT, u'Every characters must be kept.'),),
        ((TEXT, u'1 space 2 spaces 3 spaces 1 newline\nend.'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset
             in get_segments(message, keep_spaces=True)]
    self.assertEqual(found, wanted)
def test_parentheses1(self):
    # The colon right after the opening parenthesis splits the text; the
    # closing parenthesis stays with the second segment.
    source = (
        '(Exception: if the Program itself is interactive but does not'
        ' normally print such an announcement, your work based on the'
        ' Program is not required to print an announcement.) ')
    expected = [
        ((TEXT, u'(Exception:'),),
        ((TEXT, u'if the Program itself is interactive but does '
                u'not normally print such an announcement, your '
                u'work based on the Program is not required to '
                u'print an announcement.)'),),
    ]

    message = Message()
    message.append_text(source)

    found = [segment
             for segment, _context, _offset in get_segments(message)]
    self.assertEqual(found, expected)
def test_abrevations(self):
    # Each case is (input text, expected segments).  The cases are checked
    # in order and the first failing one stops the test.
    cases = (
        # 1
        (u'This is Toto Inc. a big company.',
         [((TEXT, u'This is Toto Inc. a big company.'),)]),
        # 2
        (u'Mr. From',
         [((TEXT, u'Mr. From'),)]),
    )

    for source, expected in cases:
        message = Message()
        message.append_text(source)

        found = [segment
                 for segment, _context, _offset in get_segments(message)]
        self.assertEqual(found, expected)
def _get_translatable_blocks(events):
    """Group the translatable parts of an XML event stream into messages.

    Iterates over `events`, each of which is unpacked as
    `(type, value, line)`.  Text, inline elements, comments and captured
    sub-trees are accumulated into a `Message`.  Any event that is not
    absorbed that way first flushes the pending message, yielded as
    `(MESSAGE, message, message.get_line())`, and is then yielded unchanged.
    A message still pending when the events are exhausted is yielded last.
    """
    # Default value
    encoding = 'utf-8'

    # To identify the begin/end format
    id = 0
    # Ids of the inline elements currently open (innermost last)
    id_stack = []
    # Contexts given by the enclosing element schemas; None at the bottom
    context_stack = [None]
    # Events of a sub-tree being captured verbatim, or None when not capturing
    stream = None

    message = Message()
    # Depth inside a skipped/captured sub-tree (0 = not skipping)
    skip_level = 0
    for event in events:
        type, value, line = event

        # Set the good encoding
        # (no "continue": the declaration is also passed through below)
        if type == XML_DECL:
            encoding = value[1]
        # And now, we catch only the good events
        elif type == START_ELEMENT:
            if skip_level > 0:
                # Nested element inside a skipped sub-tree: track the depth
                skip_level += 1
                if stream:
                    # The sub-tree is being captured: absorb the event
                    stream.append(event)
                    continue
            else:
                tag_uri, tag_name, attributes = value
                schema = get_element_schema(tag_uri, tag_name)

                # Context management
                if schema.context is not None:
                    context_stack.append(schema.context)

                # Skip content ?
                if schema.skip_content:
                    skip_level = 1
                    # Inside an open inline element the skipped sub-tree is
                    # captured; otherwise the event falls through and is
                    # passed on.
                    if id_stack:
                        stream = [event]
                        continue
                # Is inline ?
                elif schema.is_inline:
                    id += 1
                    id_stack.append(id)

                    start_format = _make_start_format(tag_uri, tag_name,
                                                      attributes, encoding)
                    message.append_start_format(start_format, id, line)
                    continue
                # Non-inline element inside an open inline element: capture
                # its whole sub-tree
                elif id_stack:
                    skip_level = 1
                    stream = [event]
                    continue
        elif type == END_ELEMENT:
            if skip_level > 0:
                skip_level -= 1
                if stream:
                    stream.append(event)
                    if skip_level == 0:
                        # End of the captured sub-tree: serialize it and add
                        # it to the message as one start/end format pair
                        # with a fresh id.
                        id += 1
                        aux = stream_to_str(stream, encoding)
                        aux = unicode(aux, encoding)
                        aux = [(aux, False, context_stack[-1])]
                        message.append_start_format(aux, id, line)
                        message.append_end_format([], id, line)
                        stream = None
                    continue
            else:
                tag_uri, tag_name = value[:2]
                schema = get_element_schema(tag_uri, tag_name)

                # Context management
                if schema.context is not None:
                    context_stack.pop()

                # Is inline ?
                if schema.is_inline:
                    # Close the innermost open inline element
                    message.append_end_format([(get_end_tag(value), False,
                                                None)], id_stack.pop(), line)
                    continue
        elif type == TEXT:
            # Not empty ?
            if stream:
                stream.append(event)
                continue
            # Outside a skipped sub-tree, text is kept when it is not only
            # whitespace or when "message" is truthy (presumably: a message
            # is already in progress -- verify against Message).
            elif skip_level == 0 and (value.strip() != '' or message):
                value = XMLContent.encode(value)
                value = unicode(value, encoding)
                message.append_text(value, line, context_stack[-1])
                continue
        elif type == COMMENT:
            if stream:
                stream.append(event)
                continue
            # A comment is kept in the message only when "message" is
            # truthy; it is added as a start/end format pair with a fresh id.
            elif message:
                id += 1
                if isinstance(value, str):
                    value = unicode(value, encoding)
                value = u'<!--%s-->' % value
                message.append_start_format([(value, False, None)], id, line)
                message.append_end_format([], id, line)
                continue

        # Not a good event => break + send the event
        if message:
            yield MESSAGE, message, message.get_line()
            message = Message()

        yield event
    # Send the last message!
    if message:
        yield MESSAGE, message, message.get_line()