def is_an_email_from_ext(fname, extset=EMAILEXTS):
    """
    Guess from the file name alone whether ``fname`` is an email.

    The name is converted with auto_unicode_dang_it, lower-cased and tested
    against every suffix in ``extset``. This is only a rough guess and should
    be combined with other methods.

    fname: file name or path to test.
    extset: iterable of suffixes to accept; entries should be lower case
        because the name is lower-cased before the comparison.

    Returns True when the lower-cased name ends with any of the suffixes.
    """
    lowered_name = auto_unicode_dang_it(fname).lower()
    # str.endswith only accepts a single string or a tuple of strings.
    suffixes = tuple(extset)
    return lowered_name.endswith(suffixes)
def document_to_text(filepath, okext=OKEXT):
    """
    Extract the text of the document stored at ``filepath``.

    The lower-cased result of get_file_suffixes(filepath) picks a handler out
    of BFILEHANDLEDICT; when no handler is registered for that suffix,
    auto_textract is used instead.

    filepath: path of the document to read.
    okext: collection of suffixes that are allowed to be parsed.

    Returns the extracted text passed through auto_unicode_dang_it, or u''
    when the suffix is not in ``okext`` or nothing was extracted.
    """
    suffix = get_file_suffixes(filepath).lower()
    if suffix not in okext:
        return u''

    try:
        extracted = BFILEHANDLEDICT[suffix](filepath)
    except KeyError:
        # Reached when no handler is registered for the suffix; a KeyError
        # escaping from the handler itself lands here as well.
        extracted = auto_textract(filepath)

    return auto_unicode_dang_it(extracted) if extracted else u''
def parse_multi_layer_file(uri, txt=None, ftype=None, okext=OKEXT):
    """
    Parse a file that may itself contain files, e.g. an email with attachments.

    uri: path (or name) of the file; read from disk only when ``txt`` is None.
    txt: raw content of the file, if the caller already has it.
    ftype: either the extension or a set definition, e.g. email.
        NOTE(review): the value is always replaced by the one returned from
        get_file_info_from_buffer before it is used - confirm whether a
        caller-supplied value was meant to win.
    okext: collection of acceptable suffixes, forwarded to recursive calls.

    Returns a list with one fit_into_data_mold(...) result per parsed file.
    For an email the first entry is the message itself, followed by the
    entries produced by recursively parsing each attachment body. The list is
    empty when email_whole_parse yields nothing.
    """
    if txt is None:
        with open(uri) as fogj:
            txt = fogj.read()

    if is_an_email(uri, text=txt):
        parsedtxtlist = email_whole_parse(uri=uri, text=txt)
        info, mime, ftype = get_file_info_from_buffer(txt)
        # Bound before the loop so an empty parse result returns [] instead
        # of failing with UnboundLocalError at the return statement.
        emlparsed = []
        for i, parsedtxt in enumerate(parsedtxtlist):
            if i == 0:
                # the zero gen file does not return with a file name
                parsedtxt['filename'] = uri
                emlparsed.append(
                    fit_into_data_mold(parseddict=parsedtxt,
                                       txt=txt,
                                       uri=auto_unicode_dang_it(uri),
                                       ftype=u'email',
                                       mime=mime,
                                       info=info))
            else:
                attchtxt = parsedtxt['body']
                if not attchtxt:
                    attchtxt = u''
                # Only the file type of the attachment is needed here; the
                # recursive call derives info/mime from the same buffer.
                _, _, ftype = get_file_info_from_buffer(attchtxt)
                fname = parsedtxt['filename']
                emlparsed.extend(
                    parse_multi_layer_file(uri=fname,
                                           txt=attchtxt,
                                           ftype=ftype,
                                           okext=okext))
        return emlparsed

    parsedtxt = parse_binary(string=txt, fname=uri)
    if not ftype:
        # NOTE(review): overwritten two statements below; kept so the
        # statement order and its failure modes stay as they were.
        ftype = uri.split('.')[-1]
    parseddict = {u'body': parsedtxt if parsedtxt else u''}
    info, mime, ftype = get_file_info_from_buffer(txt)
    parseddict[u'filename'] = sane_unicode(uri)
    return [fit_into_data_mold(parseddict=parseddict,
                               txt=txt,
                               uri=uri,
                               ftype=ftype,
                               mime=mime,
                               info=info)]
def parse_binary(string=None, fname=None, suffix=None, okext=OKEXT,
                 tryagain=True, **xargs):
    """
    Parse a binary file or string.

    string: raw content to parse. When empty/None and ``fname`` has an
        acceptable suffix, the file is parsed via parse_binary_from_file.
    fname: file name; supplies the suffix when ``suffix`` is not given.
    suffix: explicit extension to parse ``string`` as. Previously a valid
        explicit suffix combined with a string always raised ValueError;
        it is now honoured.
    okext: collection of suffixes that may be parsed.
    tryagain: when the first parse comes back empty, retry once with the
        extension returned by guess_ext_from_mime(string).
    xargs: accepted for call compatibility and ignored.

    Returns the parser result, or None when the suffix (or the suffix of
    ``fname``) is not in ``okext``.

    Raises ValueError when no string was provided and ``fname`` could not be
    used, which is only reached when ``suffix`` itself is in ``okext``
    (otherwise None is returned).
    """
    stringbool = bool(string)
    if (not stringbool) and fname and (get_file_suffixes(fname) in okext):
        return parse_binary_from_file(fname)

    if stringbool and not suffix:
        if fname:
            suffix = auto_unicode_dang_it(
                '.' + fname.split('.')[-1]).encode('ascii')
        else:
            suffix = guess_ext_from_mime(string)
    else:
        # Either there is nothing to parse, or the caller chose the suffix.
        if (suffix not in okext) or (
                fname and (get_file_suffixes(fname) not in okext)):
            return None
        if not stringbool:
            raise ValueError('Did not provide string or fname')
        # A string with an acceptable explicit suffix falls through.

    if (not suffix) or (suffix.lower() not in okext):
        return None

    prsd = parse_binary_from_string(string=string, suffix=suffix)
    if tryagain and not prsd:
        # Empty (or None) result: the suffix may be wrong, so ask the
        # content itself what it is.
        try:
            extbymime = guess_ext_from_mime(string)
        except KeyError:
            extbymime = None
        if extbymime and (extbymime.lower() in okext):
            try:
                return parse_binary_from_string(string, suffix=extbymime)
            except ValueError:
                LOG.debug('body len=0, and mime ' +
                          'derived ext resulted in ValueError, giving up.\t' +
                          'Supplied ext:\t' + suffix + '\t' +
                          'Mime derived ext:\t' + str(extbymime) + '\t' +
                          'Filename:\t' + str(fname))
    return prsd
def atch_fname_from_dispositions(dispositions):
    """
    Pull the attachment file name out of split Content-Disposition pieces.

    dispositions: the header value split on ";". The first element is the
        disposition type and is skipped; the rest are "label=value"
        parameters.

    Returns the cleaned, lower-cased file name of the first parameter whose
    label contains "filename", or None when there is no such parameter.

    Differences from the previous implementation:
    * parameters without "=" (e.g. the empty piece after a trailing ";") or
      with several "=" no longer raise ValueError;
    * everything after the first "=" is kept as the value, including any
      further "=" characters;
    * the "utf-8''" marker is removed as a prefix. str.strip("utf-8''")
      removed any of those characters from both ends and so ate leading and
      trailing letters of the name itself;
    * the label is compared case-insensitively.
    """
    for param in dispositions[1:]:
        # partition never raises: sep is empty when there is no "=" at all.
        label, sep, name = param.partition(b"=")
        if not sep or b"filename" not in label.lower():
            continue

        LOG.debug(b"EmailPath:\t" + param + "\t" + (name or b"Name==None"))

        name = auto_unicode_dang_it(name).strip().lower()
        name = name.strip(b"*")
        if name.startswith(b"utf-8''"):
            name = name[len(b"utf-8''"):]
        return name.replace(b"%20", b" ").strip(b'"')
    return None
def email_parse_attachment(msgpart):
    """
    Turn one message part into an attachment dict, if it is an attachment.

    msgpart: a message part offering get, get_payload, get_filename and
        get_content_type.

    Returns None when the part has no Content-Disposition header, when the
    disposition type is not "attachment", or when reading the transfer
    encoding / base64-decoding the payload raises AttributeError or
    TypeError (this includes a missing Content-Transfer-Encoding header).
    Otherwise returns a dict with the keys u"body", u"type" and u"filename".

    The file name taken from the disposition parameters is preferred; when
    that lookup yields nothing, the value of msgpart.get_filename() is used
    instead of storing an empty name.
    """
    # TODO (steven_c) Make less complex.
    content_disposition = msgpart.get(b"Content-Disposition", None)
    if not content_disposition:
        return None

    dispositions = content_disposition.strip().split(b";")
    # Tolerate whitespace between the disposition type and the first ";".
    if dispositions[0].strip().lower() != b"attachment":
        return None

    filedata = msgpart.get_payload()
    try:
        if b"base64" in msgpart.get(b"Content-Transfer-Encoding", None).lower():
            filedata = b64decode(filedata)
    except (AttributeError, TypeError):
        return None

    fname = auto_unicode_dang_it(msgpart.get_filename())
    # Attachments carrying this placeholder name get an empty body.
    if match(br"(Untitled)(.{0,3})(attachment)(.{0,10})(\.txt)", fname):
        filedata = u""

    dispname = atch_fname_from_dispositions(dispositions)
    return {
        u"body": filedata,
        u"type": msgpart.get_content_type(),
        # fyi, this is a filename not pointer.
        u"filename": dispname if dispname else fname,
    }
def email_parse(content, extraheaders=EXTRA_HEADERS,
                extraaddress_headers=EXTRA_ADDRESS_HEADERS):
    """
    Parse a raw email into a dict of fields plus its list of attachments.

    Returns unicode. The 'Date' header is run through
    normize_dtime_tmzn_nrth_am (described by the original docstring as a
    conversion to UTC).

    content: the raw message; it is passed through str() before parsing.
    extraheaders: header names copied into the result as single values
        (u"" when a header cannot be read).
    extraaddress_headers: header names split on "," and stored as a tuple of
        parseaddr pairs (((u"", u""),) when a header cannot be read).

    Returns (msgbits, attachments). msgbits holds u"subject", u"body",
    u"from", u"attachment" and u"datetime" plus one key per extra header;
    attachments is the list of dicts from email_parse_attachment.

    Raises ValueError when building the required fields raises ValueError.
    """
    msgobj = Parser().parsestr(str(content))

    raw_subject = msgobj["Subject"]
    if raw_subject is None:
        subject = u""
    else:
        pieces = []
        for fragment, encoding in decode_header(raw_subject):
            # Fragments without a declared encoding are kept untouched.
            pieces.append(
                auto_unicode_dang_it(fragment, encoding) if encoding
                else fragment)
        subject = "".join(pieces)

    attachments = []
    body_text = u""
    for part in msgobj.walk():
        attachment = email_parse_attachment(part)
        if attachment:
            attachments.append(attachment)
            continue

        if part.get_content_type() == "text/plain":
            payload = part.get_payload(decode=True)
            charset = part.get_content_charset() or "utf-8"
            if payload:
                body_text += auto_unicode_dang_it(payload, charset)
            continue

        # HTML is only a fallback while no body text has been collected.
        if body_text or part.get_content_type() != "text/html":
            continue
        payload = part.get_payload(decode=True)
        if payload:
            body_text += auto_unicode_dang_it(
                payload, part.get_content_charset(), "replace")

    try:
        try:
            sent_at = sane_unicode(normize_dtime_tmzn_nrth_am(msgobj["date"]))
        except TypeError:
            sent_at = None
        subject_text = auto_unicode_dang_it(subject)
        sender = tuple(map(auto_unicode_dang_it,
                           parseaddr(msgobj.get("From"))))
        msgbits = {
            u"subject": subject_text,
            u"body": body_text,
            # 'body_html': body_html,
            u"from": sender,
            u"attachment": attch_stats_from_attchdict(attachments),
            u"datetime": sent_at,
        }
    except ValueError:
        LOG.critical("Could not parse required headers")
        raise ValueError("Was not able to parse all required email headers.")

    for field in extraaddress_headers or ():
        try:
            msgbits[field] = tuple(
                tuple(map(auto_unicode_dang_it, parseaddr(chunk)))
                for chunk in msgobj[field].split(",")
            )
        except (KeyError, AttributeError):
            msgbits[field] = ((u"", u""),)

    for field in extraheaders or ():
        try:
            msgbits[field] = auto_unicode_dang_it(msgobj[field])
        except (KeyError, AttributeError, ValueError):
            msgbits[field] = u""

    return msgbits, attachments