def parse_multi_layer_file(uri, txt=None, ftype=None, okext=OKEXT):
    """
    Parse a file that may itself contain files, e.g. an email with attachments.

    Emails are expanded recursively: the message itself comes first, followed
    by the parse results of every attachment (which may again be emails).
    Anything else is handed to ``parse_binary`` and yields a one-item list.

    :param uri: path / name of the file; read from disk when ``txt`` is None.
    :param txt: content of the file, if the caller has already loaded it.
    :param ftype: either the extension or a set definition, e.g. email.
        NOTE(review): the value passed in is always replaced by the type
        sniffed from the content before it is used (the old extension
        fallback was overwritten the same way) -- confirm this is intended.
    :param okext: only forwarded to the recursive calls.
    :returns: list with one ``fit_into_data_mold`` result per parsed file;
        empty when the email parser returned no parts.
    """
    if txt is None:
        with open(uri) as fobj:
            txt = fobj.read()

    if is_an_email(uri, text=txt):
        parsedtxtlist = email_whole_parse(uri=uri, text=txt)
        info, mime, ftype = get_file_info_from_buffer(txt)
        # Initialised up front so an empty part list returns [] instead of
        # failing with UnboundLocalError at the return below.
        emlparsed = []
        for i, parsedtxt in enumerate(parsedtxtlist):
            if i == 0:
                # the zero gen file does not return with a file name
                parsedtxt['filename'] = uri
                emlparsed.append(
                    fit_into_data_mold(parseddict=parsedtxt,
                                       txt=txt,
                                       uri=auto_unicode_dang_it(uri),
                                       ftype=u'email',
                                       mime=mime,
                                       info=info))
                continue

            # Every further entry is an attachment; parse it as its own file.
            attchtxt = parsedtxt['body']
            if not attchtxt:
                attchtxt = u''
            info, mime, ftype = get_file_info_from_buffer(attchtxt)
            fname = parsedtxt['filename']
            emlparsed.extend(
                parse_multi_layer_file(uri=fname, txt=attchtxt,
                                       ftype=ftype, okext=okext))
        return emlparsed

    parsedtxt = parse_binary(string=txt, fname=uri)
    # A falsy parse result is stored as an empty unicode body.
    parseddict = {u'body': parsedtxt if parsedtxt else u''}
    info, mime, ftype = get_file_info_from_buffer(txt)
    parseddict[u'filename'] = sane_unicode(uri)
    return [fit_into_data_mold(parseddict=parseddict, txt=txt, uri=uri,
                               ftype=ftype, mime=mime, info=info)]
def test_sane_unicode4():
    """sane_unicode(TEST_STRING_4) must equal the expected text, as unicode."""
    converted = sane_unicode(TEST_STRING_4)
    assert converted == u'"€"'
    assert isinstance(converted, unicode)
def test_sane_unicode3():
    """sane_unicode(TEST_STRING_3) must equal the expected text, as unicode."""
    converted = sane_unicode(TEST_STRING_3)
    assert converted == u'"más"'
    assert isinstance(converted, unicode)
def test_sane_unicode1():
    """sane_unicode(TEST_STRING_1) must equal the expected text, as unicode."""
    converted = sane_unicode(TEST_STRING_1)
    assert converted == u"If numbers aren't beautiful, I don't know what is. –Paul Erdős"
    assert isinstance(converted, unicode)
def test_sane_unicode__with_unicode():
    """Input that is already unicode must come back equal and still unicode."""
    converted = sane_unicode(u"monkey")
    assert converted == u"monkey"
    assert isinstance(converted, unicode)
def get_file_info_from_buffer(txt):
    """
    Sniff a content buffer and return ``(info, mime, ftype)`` as text.

    ``info`` and ``mime`` are the results of ``from_buffer(txt)`` and
    ``from_buffer(txt, True)`` (presumably the description and the MIME type
    -- confirm against the ``from_buffer`` in use); ``ftype`` is whatever
    follows the last ``/`` in ``mime``.
    """
    # Consider putting in utils
    description = sane_unicode(from_buffer(txt))
    mime_type = sane_unicode(from_buffer(txt, True))
    subtype = mime_type.rsplit(u'/', 1)[-1]
    # Consider using a namedtuple.
    return description, mime_type, sane_unicode(subtype)
def email_parse(content, extraheaders=EXTRA_HEADERS, extraaddress_headers=EXTRA_ADDRESS_HEADERS):
    """
    Parse a raw email into a dict of text fields plus its attachments.

    The body is the concatenation of all ``text/plain`` parts; a
    ``text/html`` part is used only while no body text has been collected.
    The 'Date' header goes through ``normize_dtime_tmzn_nrth_am`` (per the
    original note this converts it to UTC -- not verifiable from here).

    :param content: the raw message; it is passed through ``str()`` first.
    :param extraheaders: header names copied into the result as text
        (``u""`` when a header cannot be read).
    :param extraaddress_headers: header names parsed as address lists; each
        becomes a tuple of ``(name, address)`` tuples, or ``((u"", u""),)``
        when the header is missing or holds no address.
    :returns: ``(msgbits, attachments)`` -- the dict of parsed fields and the
        list of truthy results of ``email_parse_attachment``.
    :raises ValueError: if building the required fields raised ValueError.
    """
    msgobj = Parser().parsestr(str(content))

    raw_subject = msgobj["Subject"]
    if raw_subject is not None:
        subj_fragments = []
        for fragment, enc in decode_header(raw_subject):
            # Fragments without a declared charset are kept as decoded.
            if enc:
                fragment = auto_unicode_dang_it(fragment, enc)
            subj_fragments.append(fragment)
        subject = "".join(subj_fragments)
    else:
        subject = u""

    attachments = []
    body_text = u""
    for part in msgobj.walk():
        attachment = email_parse_attachment(part)
        if attachment:
            attachments.append(attachment)
        elif part.get_content_type() == "text/plain":
            bodypayload = part.get_payload(decode=True)
            charset = part.get_content_charset()
            if not charset:
                charset = "utf-8"
            if bodypayload:
                body_text += auto_unicode_dang_it(bodypayload, charset)
        elif (not body_text) and (part.get_content_type() == "text/html"):
            # HTML is only a fallback while no body text has been collected.
            htmlpayload = part.get_payload(decode=True)
            if htmlpayload:
                body_text += auto_unicode_dang_it(
                    htmlpayload, part.get_content_charset(), "replace")

    try:
        try:
            sent_at = sane_unicode(normize_dtime_tmzn_nrth_am(msgobj["date"]))
        except TypeError:
            sent_at = None
        msgbits = {
            u"subject": auto_unicode_dang_it(subject),
            u"body": body_text,
            u"from": tuple([auto_unicode_dang_it(addr)
                            for addr in parseaddr(msgobj.get("From"))]),
            u"attachment": attch_stats_from_attchdict(attachments),
            u"datetime": sent_at,
        }
    except ValueError:
        LOG.critical("Could not parse required headers")
        raise ValueError("Was not able to parse all required email headers.")

    if extraaddress_headers:
        # Imported here because only ``parseaddr`` is known to be in scope.
        from email.utils import getaddresses

        no_address = ((u"", u""),)
        for field in extraaddress_headers:
            try:
                raw = msgobj[field]
                # getaddresses() respects quoting, so a display name such as
                # "Doe, John" is not torn apart the way split(",") did.
                found = getaddresses([raw]) if raw is not None else []
                pairs = tuple(
                    tuple(auto_unicode_dang_it(item) for item in pair)
                    for pair in found
                )
            except (KeyError, AttributeError):
                pairs = ()
            msgbits[field] = pairs or no_address

    if extraheaders:
        for field in extraheaders:
            try:
                msgbits[field] = auto_unicode_dang_it(msgobj[field])
            except (KeyError, AttributeError, ValueError):
                msgbits[field] = u""

    return msgbits, attachments