Example #1
0
File: utils.py Project: CCLab/sezam
def clean_text_for_search(text):
    """
    Prepare text for indexing and search.

    The text is converted with `force_unicode`, stripped, cleaned from a
    leading e-mail quotation marker and from e-mail addresses, passed
    through `html2text` (best effort) and finally flattened into a single
    line with single spaces between the words.

    Returns the cleaned text.
    """
    # Get the normalized unicode text.
    text= force_unicode(text).strip()

    # Remove e-mail quotation from the beginning of the string. The pattern
    # is not multiline, so only `>` characters that open the text are dropped.
    text= re.sub(r'^\>+', '', text)

    # Remove e-mail addresses.
    text= re.sub(r'\b[A-Za-z0-9_\.-]+@[A-Za-z0-9_\.-]+[A-Za-z0-9_][A-Za-z0-9_]\b', '', text)

    # Try to convert html to text.
    try:
        text= html2text(text)
    except Exception:
        # Best effort: if the conversion fails, go on with the unconverted
        # text. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    # Clean the text from special characters, such as
    # section divisions ***, etc. but preserve punctuation.
    text= re.sub(r'\B\W{2,}\B', ' ', text)

    # Remove all returns and new lines.
    text= re.sub(r'\n+', ' ', text)
    text= re.sub(r'\r+', ' ', text)

    # Convert multiple spaces to singles.
    text= re.sub(r'\s{2,}', ' ', text)

    return text
Example #2
0
 def extract_mail_content(self, message_data, **kwargs):
     """
     Returns text message content and the list of attachments.

     `message_data` is an iterable of response parts; only the items that
     are tuples are processed, and their second element is parsed as the
     raw message with `email.message_from_string`.

     Any extra keyword arguments are passed through unchanged to
     `self._process_attachment`.

     Returns a tuple `(msg_plain_text, msg_attachments)`:
     - `msg_plain_text` is the first non-empty `text/plain` or `text/html`
       part (html is converted with `html2text`), encoded as UTF-8. It is
       an empty string if no such part is found.
     - `msg_attachments` is a list of dicts with the keys `filename` and
       `filesize`, one for every part that has a Content-Disposition header
       and for which `self._process_attachment` returned a name.
     """
     msg_plain_text, msg_attachments= '', []
     for response_part in message_data:
         if not isinstance(response_part, tuple):
             continue
         msg= email.message_from_string(response_part[1])
         for part in msg.walk():
             # Multipart containers carry no payload of their own.
             if part.is_multipart():
                 continue

             # A part with a Content-Disposition header is treated as
             # an attachment.
             if part.get_params(None, 'Content-Disposition'):
                 attachment_size= len(part.get_payload(decode=True))
                 # NOTE(review): presumably stores the attachment and returns
                 # its file name (or a false value) - confirm against
                 # `_process_attachment`.
                 attachment_name= self._process_attachment(part, **kwargs)
                 if attachment_name:
                     msg_attachments.append({'filename': attachment_name,
                                             'filesize': attachment_size})
                 continue

             # Process message text.
             # Update `msg_plain_text` only if it isn't updated yet.
             if msg_plain_text:
                 continue
             content_type= str(part.get_content_type())
             if content_type not in ('text/plain', 'text/html'):
                 continue

             payload= part.get_payload(decode=True)
             # `get_content_charset()` returns None when the part declares
             # no charset; decoding with None raises TypeError, so fall
             # back to UTF-8 (a superset of the us-ascii default).
             charset= part.get_content_charset() or 'utf-8'
             try:
                 decoded= payload.decode(charset, 'ignore')
             except LookupError:
                 # The declared charset is unknown to Python.
                 decoded= payload.decode('utf-8', 'ignore')

             # Decode first, convert second: html2text works on text,
             # and its result must not be decoded once more.
             if content_type == 'text/html':
                 decoded= html2text(decoded)
             msg_plain_text= decoded.encode('utf8', 'replace')
     return msg_plain_text, msg_attachments