def _process_single_node(self, node):
     """
     Extract the contents of a single XML dump node.

     When ``use_full_parser`` accepts the node text, the text is passed
     through ``fix_broken_hotmail_headers``, parsed as a MIME message and
     handed to ``get_nested_payload``. Otherwise the message is assembled
     from the node's ``subject``, ``from``, ``to`` and ``receivedat``
     children, with the node text used as the body.

     :param node: The XML node corresponding to a message
     :return: An EmailMessage instance containing the message contents
     """
     # remove leading char that got into the text somehow
     text = unicode(node.find('text').text).lstrip(u'>')
     if use_full_parser(text):
         text = fix_broken_hotmail_headers(text)
         mime_message = Parser().parse(StringIO(text))
         return_message = get_nested_payload(mime_message)
     else:
         return_message = EmailMessage()
         subject_node = node.find('subject')
         from_node = node.find('from')
         to_node = node.find('to')
         date_node = node.find('receivedat')

         # The XML parser may hand back either a byte string or an already
         # decoded unicode object (and None for an empty element). Only byte
         # strings can be decoded; decoding unicode raises TypeError.
         subject = ''
         if subject_node is not None and subject_node.text is not None:
             subject = subject_node.text
             if not isinstance(subject, unicode):
                 subject = unicode(subject, 'utf-8')

         # Unicode templates: formatting non-ASCII names into a byte-string
         # template raises UnicodeEncodeError on Python 2.
         sender = clean_sender(u'{} <{}>'.format(
             from_node.find('name').text,
             from_node.find('email').text))
         recipient = clean_recipient(u'{} <{}>'.format(
             to_node.find('name').text,
             to_node.find('email').text))
         date_string = '{} {}'.format(
             date_node.find('date').text,
             date_node.find('time').text)

         return_message.append_body(text)
         return_message.subject = subject
         return_message.sender = sender
         return_message.recipient = recipient
         # Parse the local timestamp, then convert it using the configured
         # timezone.
         return_message.date = normalize_to_utc(parse(date_string),
                                                self._timezone)
     return_message.source = "XML File {} node {}".format(
         self._process_path, node.attrib)
     return return_message
 def _process_multipart_eml(file_path):
     """
     Load an EML file, repair it if needed, and parse it into a message.

     The file is decoded as windows-1252. When ``use_full_parser`` accepts
     the text it is first passed through ``fix_broken_yahoo_headers``; the
     text is then parsed as MIME and handed to ``get_nested_payload``.

     :param file_path: The path to the EML file to process
     :return: A structured EmailMessage instance
     """
     # Read everything up front so the file is closed before parsing starts.
     with codecs.open(file_path, 'rb', 'windows-1252') as eml_file:
         raw_text = eml_file.read()

     if use_full_parser(raw_text):
         raw_text = fix_broken_yahoo_headers(raw_text)

     parsed = Parser().parse(StringIO(raw_text))
     result = get_nested_payload(parsed)
     # Record where the message came from.
     result.source = "EML File {}".format(file_path)
     return result
 def _process_multipart_eml(file_path):
     """
     Turn an EML file on disk into a structured message.

     The file contents are decoded as windows-1252, optionally cleaned with
     ``fix_broken_yahoo_headers`` (when ``use_full_parser`` says so), parsed
     as a MIME message and reduced by ``get_nested_payload``.

     :param file_path: The path to the EML file to process
     :return: A structured EmailMessage instance
     """
     with codecs.open(file_path, 'rb', 'windows-1252') as handle:
         contents = handle.read()

     needs_header_fix = use_full_parser(contents)
     cleaned = fix_broken_yahoo_headers(contents) if needs_header_fix else contents

     message = get_nested_payload(Parser().parse(StringIO(cleaned)))
     # Tag the message with the file it was read from.
     message.source = "EML File {}".format(file_path)
     return message
Example #4
0
 def _process_single_node(self, node):
     """
     Extract the contents of a single XML dump node.

     When ``use_full_parser`` accepts the node text, the text is passed
     through ``fix_broken_hotmail_headers``, parsed as a MIME message and
     handed to ``get_nested_payload``. Otherwise the message is assembled
     from the node's ``subject``, ``from``, ``to`` and ``receivedat``
     children, with the node text used as the body.

     :param node: The XML node corresponding to a message
     :return: An EmailMessage instance containing the message contents
     """
     # remove leading char that got into the text somehow
     text = unicode(node.find('text').text).lstrip(u'>')
     if use_full_parser(text):
         text = fix_broken_hotmail_headers(text)
         mime_message = Parser().parse(StringIO(text))
         return_message = get_nested_payload(mime_message)
     else:
         return_message = EmailMessage()
         subject_node = node.find('subject')
         from_node = node.find('from')
         to_node = node.find('to')
         date_node = node.find('receivedat')

         # The XML parser may hand back either a byte string or an already
         # decoded unicode object (and None for an empty element). Only byte
         # strings can be decoded; decoding unicode raises TypeError.
         subject = ''
         if subject_node is not None and subject_node.text is not None:
             subject = subject_node.text
             if not isinstance(subject, unicode):
                 subject = unicode(subject, 'utf-8')

         # Unicode templates: formatting non-ASCII names into a byte-string
         # template raises UnicodeEncodeError on Python 2.
         sender = clean_sender(u'{} <{}>'.format(
             from_node.find('name').text,
             from_node.find('email').text))
         recipient = clean_recipient(u'{} <{}>'.format(
             to_node.find('name').text,
             to_node.find('email').text))
         date_string = '{} {}'.format(
             date_node.find('date').text,
             date_node.find('time').text)

         return_message.append_body(text)
         return_message.subject = subject
         return_message.sender = sender
         return_message.recipient = recipient
         # Parse the local timestamp, then convert it using the configured
         # timezone.
         return_message.date = normalize_to_utc(parse(date_string),
                                                self._timezone)
     return_message.source = "XML File {} node {}".format(
         self._process_path, node.attrib)
     return return_message