def extract_headers_metadata(self, headers):
    """Store the raw headers on the result and derive metadata from them.

    ``headers`` is an iterable of ``(field, value)`` pairs. The pairs are
    stored on ``self.result.headers`` (via ``safe_dict``), then each pair is
    normalised with ``safe_string`` and dispatched on the lower-cased field
    name:

    * ``subject``     -> ``self.update('title', ...)``
    * ``message-id``  -> ``self.update('message_id', ...)``
    * ``in-reply-to`` -> ``self.result.emit_in_reply_to(...)``
    * ``references``  -> one ``emit_in_reply_to`` call per
      whitespace-separated token
    * ``date``        -> parsed and stored as ``created_at``; a parse failure
      is logged and otherwise ignored
    * ``from``        -> every name returned by ``self.parse_emails`` is
      passed to ``self.update('author', ...)``
    * ``to``/``cc``/``bcc`` -> ``self.parse_emails`` is called and its return
      value discarded (presumably for its side effects -- confirm in
      ``parse_emails``)

    Pairs whose field or value normalises to ``None`` are skipped.
    """
    # The pairs are walked twice (once for the raw dict, once for the
    # normalised list), so materialise them in case a one-shot iterator
    # was passed in.
    headers = list(headers)
    self.result.headers = safe_dict(dict(headers))
    pairs = [(safe_string(k), safe_string(v)) for k, v in headers]

    for field, value in pairs:
        # Check for None *before* lower-casing; calling .lower() first
        # would raise AttributeError on a None field name.
        if field is None or value is None:
            continue
        field = field.lower()

        if field == 'subject':
            self.update('title', value)
        elif field == 'message-id':
            self.update('message_id', value)
        elif field == 'in-reply-to':
            self.result.emit_in_reply_to(value)
        elif field == 'references':
            for email_addr in value.split():
                self.result.emit_in_reply_to(email_addr)
        elif field == 'date':
            try:
                parsed = email.utils.parsedate(value)
                created_at = datetime.fromtimestamp(mktime(parsed))
                self.update('created_at', created_at)
            except Exception as ex:
                # Best effort: a bad date must not abort header extraction.
                # Log the raw header text, not an intermediate value.
                log.warning("Failed to parse [%s]: %s", value, ex)
        elif field == 'from':
            for (name, _) in self.parse_emails(value):
                self.update('author', name)
        elif field in ('to', 'cc', 'bcc'):
            self.parse_emails(value)
def ingest(self, file_path):
    """Ingest an OPF XML file that holds exactly one ``email`` element.

    The result is flagged as an email, the file is parsed with
    ``self.parse_xml``, and the text of the email element's children is
    collected into a tag -> text mapping. From that mapping and the
    contact helpers this method builds ``self.result.headers``, updates
    title / author / summary / created_at / modified_at, and finally
    extracts either the HTML body or the plain-text body.

    Raises:
        ProcessingException: if ``parse_xml`` raises ``TypeError``, or if
            the number of ``//email`` matches is not exactly one (the
            message reads "More than one email" in the zero-match case
            as well).
    """
    self.result.flag(self.result.FLAG_EMAIL)
    try:
        tree = self.parse_xml(file_path)
    except TypeError:
        raise ProcessingException("Cannot parse OPF XML file.")

    matches = tree.findall('//email')
    if len(matches) != 1:
        raise ProcessingException("More than one email in file.")
    message = tree.find('//email')

    # Tag -> normalised text, for children that carry non-empty text.
    props = {}
    for child in message.getchildren():
        if child.text:
            props[child.tag] = safe_string(child.text)

    # The subject is only read here; it is popped further down when the
    # title is set. The message id is consumed immediately.
    headers = {
        'Subject': props.get('OPFMessageCopySubject'),
        'Message-ID': props.pop('OPFMessageCopyMessageID', None),
    }
    contact_headers = (
        ('From', 'OPFMessageCopyFromAddresses'),
        ('Sender', 'OPFMessageCopySenderAddress'),
        ('To', 'OPFMessageCopyToAddresses'),
        ('CC', 'OPFMessageCopyCCAddresses'),
        ('BCC', 'OPFMessageCopyBCCAddresses'),
    )
    for header_name, tag in contact_headers:
        headers[header_name] = self.get_contacts(message, tag)

    # Re-format the sent time for the Date header; the raw string stays
    # in ``props`` and is used for ``created_at`` below.
    sent = props.get('OPFMessageCopySentTime')
    if sent is not None:
        sent_dt = datetime.strptime(sent, '%Y-%m-%dT%H:%M:%S')
        timestamp = time.mktime(sent_dt.timetuple())
        headers['Date'] = utils.formatdate(timestamp)

    self.result.headers = safe_dict(headers)

    for key in ('OPFMessageCopySubject', 'OPFMessageCopyThreadTopic'):
        self.update('title', props.pop(key, None))

    author_tags = (
        'OPFMessageCopyFromAddresses',
        'OPFMessageCopySenderAddress',
    )
    for tag in author_tags:
        self.update('author', self.get_contact_name(message, tag))

    simple_fields = (
        ('summary', 'OPFMessageCopyPreview'),
        ('created_at', 'OPFMessageCopySentTime'),
        ('modified_at', 'OPFMessageCopyModDate'),
    )
    for prop, key in simple_fields:
        self.update(prop, props.pop(key, None))

    body = props.pop('OPFMessageCopyBody', None)
    html = props.pop('OPFMessageCopyHTMLBody', None)
    has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)

    # Fall back to plain text unless the HTML flag is set and the HTML
    # body normalises to something non-empty.
    if not (has_html and safe_string(html)):
        self.extract_plain_text_content(body)
        self.result.flag(self.result.FLAG_PLAINTEXT)
        return

    self.extract_html_content(html)
    self.result.flag(self.result.FLAG_HTML)
def _parse_headers(self, message):
    """Populate header metadata for ``message``.

    If the message has a raw header blob in field ``'007D'``, it is parsed
    with ``Parser().parsestr(..., headersonly=True)`` and the resulting
    items are handed to ``self.extract_headers_metadata``. When that field
    is missing, or parsing/extraction raises, ``self.result.headers`` is
    instead built from the individual fields of the original message
    (``'0037'`` subject, ``'0E02'`` BCC, ``'0E03'`` CC, ``'0E04'`` To,
    ``'1046'`` From, ``'1035'`` Message-ID).
    """
    headers = message.getField('007D')
    if headers is not None:
        try:
            # Bind the parsed headers to their own name. Rebinding
            # ``message`` here would make the fallback below call
            # getField() on the parsed object (which, assuming the stdlib
            # email parser, has no such method) whenever extraction fails.
            parsed = Parser().parsestr(headers, headersonly=True)
            self.extract_headers_metadata(parsed.items())
            return
        except Exception:
            # Best effort: fall through to the per-field fallback.
            log.warning("Cannot parse headers: %s", headers)

    self.result.headers = safe_dict({
        'Subject': message.getField('0037'),
        'BCC': message.getField('0E02'),
        'CC': message.getField('0E03'),
        'To': message.getField('0E04'),
        'From': message.getField('1046'),
        'Message-ID': message.getField('1035'),
    })