def guessCharsetAndConvert(document, text_content, content_type):
    """
    Re-encode text_content as UTF-8 using a guessed codec.

    The codec is first guessed with guessEncodingFromText for the given
    content_type. If decoding with that codec fails, a second guess is made
    treating the data as 'text/plain'.

    document is accepted but not used by this function.
    text_content must provide .decode(); presumably a Python 2 byte
    string -- confirm against callers.

    Returns a (text_content, message) tuple. text_content is the UTF-8
    encoded text when a conversion succeeded, otherwise the input unchanged.
    message describes the outcome of the last conversion that was attempted.
    """
    succeeds = 'Conversion to base format with codec %r succeeds'
    fails = 'Conversion to base format with codec %r fails'

    codec = guessEncodingFromText(text_content, content_type)
    if codec is None:
        return text_content, 'Conversion to base format without codec fails'

    try:
        converted = text_content.decode(codec).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        # Remember the first failure: it is the reported outcome when the
        # second guesser cannot propose any codec.
        message = fails % codec
    else:
        return converted, succeeds % codec

    # try again with another guesser based on file command
    codec = guessEncodingFromText(text_content, 'text/plain')
    if codec is None:
        return text_content, message

    try:
        converted = text_content.decode(codec).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        return text_content, fails % codec
    return converted, succeeds % codec
def getContentInformation(self):
    """
    Returns the content information from the header information.
    This is used by the metadata discovery system.

    Every header of the message is split with decode_header and each
    fragment is converted to UTF-8, since this is the standard way of
    representing strings in ERP5. A fragment that cannot be decoded with
    its declared (or the default) codec is retried with a codec guessed by
    guessEncodingFromText; as a last resort its escaped repr() is used.

    NOTE(review): in the code visible here `result` is created but never
    filled nor returned -- the end of this method appears to be missing
    from this view; confirm against the complete file.
    """
    result = {}
    for name, raw_value in self._getMessage().items():
        try:
            fragments = decode_header(raw_value)
        except HeaderParseError as error_message:
            # An unparsable header is logged and contributes no fragment.
            fragments = ()
            LOG('EmailDocument.getContentInformation', INFO,
                'Failed to decode %s header of %s with error: %s' %
                (name, self.getPath(), error_message))
        for text, encoding in fragments:
            try:
                if encoding is None:
                    # No declared charset: rely on the default codec.
                    text = text.decode().encode('utf-8')
                else:
                    text = text.decode(encoding).encode('utf-8')
            except (UnicodeDecodeError, LookupError):
                encoding = guessEncodingFromText(text, content_type='text/plain')
                if encoding is None:
                    # repr() without its surrounding quotes keeps the raw
                    # bytes in an escaped, printable form.
                    text = repr(text)[1:-1]
                else:
                    try:
                        text = text.decode(encoding).encode('utf-8')
                    except (UnicodeDecodeError, LookupError):
                        text = repr(text)[1:-1]
def getTextContent(self, default=_MARKER):
    """
    Returns the content of the email as text. This is useful
    to display the content of an email.

    According to rfc (http://tools.ietf.org/html/rfc2046#section-5.1.4),
    getTextContent should return the html part of a multipart/alternative
    couple. If multipart/mixed, the html part is an attachment, so the
    main content (text/plain) is returned.

    Without a file, the stored text content is returned (default is
    forwarded only when the caller supplied one). With a file, the text
    part of the message is converted: html parts go through self.convert,
    other parts are re-encoded to UTF-8, falling back to a guessed codec
    and finally to repr() of the raw payload.

    NOTE(review): on the file branch the code visible here computes
    `text_result` but does not return it -- the end of this method appears
    to be missing from this view; confirm against the complete file.
    """
    self._checkConversionFormatPermission(None)
    if not self.hasFile():
        # Return the standard text content if no file was provided
        # Or standard text content is not empty.
        if default is _MARKER:
            return self._baseGetTextContent()
        return self._baseGetTextContent(default)

    text_part = self._getMessageTextPart()
    if text_part is None:
        text_result = ""
    else:
        charset = text_part.get_content_charset()
        payload = text_part.get_payload(decode=1)
        if text_part.get_content_type() == 'text/html':
            mime, text_result = self.convert(format='html',
                                             text_content=payload,
                                             charset=charset)
        elif charset == 'utf-8':
            # Already in the target encoding: use the payload untouched.
            text_result = payload
        else:
            try:
                if charset is None:
                    # No declared charset: rely on the default codec.
                    text_result = payload.decode().encode('utf-8')
                else:
                    text_result = payload.decode(charset).encode('utf-8')
            except (UnicodeDecodeError, LookupError) as error_message:
                LOG('EmailDocument.getTextContent', INFO,
                    'Failed to decode %s TEXT message of %s with error: %s' %
                    (charset, self.getPath(), error_message))
                guessed_codec = guessEncodingFromText(
                    payload, content_type=text_part.get_content_type())
                if guessed_codec is None:
                    text_result = repr(payload)
                else:
                    try:
                        text_result = payload.decode(guessed_codec).encode('utf-8')
                    except (UnicodeDecodeError, LookupError):
                        text_result = repr(payload)
def testCharsetAndConvert(text_content, content_type, encoding):
    """
    Try to convert text_content to UTF-8 with the given encoding.

    When encoding is None the default codec is used. If that decoding
    fails, encoding is replaced by the codec guessed by
    guessEncodingFromText for content_type and the conversion is retried;
    if no codec is guessed or the retry fails too, text_content becomes
    its repr() stripped of the surrounding quotes.

    NOTE(review): the code visible here only rebinds the local names
    `text_content` and `encoding` and returns nothing -- the end of this
    function appears to be missing from this view; confirm against the
    complete file.
    """
    try:
        if encoding is None:
            text_content = text_content.decode().encode('utf-8')
        else:
            text_content = text_content.decode(encoding).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        encoding = guessEncodingFromText(text_content, content_type)
        if encoding is None:
            # Escaped, printable form of the raw bytes.
            text_content = repr(text_content)[1:-1]
        else:
            try:
                text_content = text_content.decode(encoding).encode('utf-8')
            except (UnicodeDecodeError, LookupError):
                text_content = repr(text_content)[1:-1]
def _guessEncoding(self, string, mime='text/html'):
    """
    Deprecated method.

    Forwards to guessEncodingFromText, passing mime as its content_type,
    and returns whatever that function returns.
    """
    guessed_encoding = guessEncodingFromText(string, content_type=mime)
    return guessed_encoding