Example #1
0
def is_an_email_from_ext(fname, extset=EMAILEXTS):
    """
    Guess from a file name's ending whether the file is an email.

    The name is converted with auto_unicode_dang_it and lower-cased, then
    tested against every entry of extset as a possible suffix.  Because
    only the name is lower-cased, the entries of extset should already be
    lower case.

    This is a heuristic only; combine it with other checks.
    """
    lowered_name = auto_unicode_dang_it(fname).lower()
    known_suffixes = tuple(extset)
    return lowered_name.endswith(known_suffixes)
Example #2
0
def document_to_text(filepath, okext=OKEXT):
    """
    Extract the text of the document at filepath.

    The lower-cased result of get_file_suffixes(filepath) must be in okext,
    otherwise u'' is returned.  The suffix selects a parser from
    BFILEHANDLEDICT; on KeyError the file is handed to auto_textract
    instead.  Truthy results are passed through auto_unicode_dang_it,
    falsy results become u''.
    """
    suffix = get_file_suffixes(filepath).lower()
    if suffix not in okext:
        return u''
    try:
        # Lookup and call share one try, so a KeyError raised inside the
        # parser itself also triggers the auto_textract fallback.
        extracted = BFILEHANDLEDICT[suffix](filepath)
    except KeyError:
        extracted = auto_textract(filepath)
    return auto_unicode_dang_it(extracted) if extracted else u''
Example #3
0
def parse_multi_layer_file(uri, txt=None, ftype=None, okext=OKEXT):
    """
    Parse a file that may itself contain files, e.g. an email with
    attachments.

    uri:   name / path of the file.  It is opened and read when txt is
           None.
    txt:   content of the file, if the caller already has it.
    ftype: type hint for the file.
           NOTE(review): in both branches this value is overwritten by the
           third element returned from get_file_info_from_buffer before it
           is used, so the argument currently has no effect -- confirm
           whether that is intended.
    okext: forwarded unchanged to the recursive calls made for
           attachments.

    Returns a list of fit_into_data_mold(...) results.  For an email the
    first entry is the message itself (ftype u'email') followed by the
    entries produced by recursively parsing each attachment; an email
    parse that yields no parts returns an empty list.  For any other file
    the list has exactly one entry.
    """
    if txt is None:
        with open(uri) as fobj:
            txt = fobj.read()

    if is_an_email(uri, text=txt):
        parsedtxtlist = email_whole_parse(uri=uri,
                                          text=txt)
        info, mime, ftype = get_file_info_from_buffer(txt)
        # Bound before the loop so an empty parsedtxtlist returns [] instead
        # of raising UnboundLocalError at the return statement.
        emlparsed = []
        for i, parsedtxt in enumerate(parsedtxtlist):
            if i == 0:
                # the zero gen file does not return with a file name
                parsedtxt['filename'] = uri
                emlparsed.append(
                    fit_into_data_mold(parseddict=parsedtxt,
                                       txt=txt,
                                       uri=auto_unicode_dang_it(uri),
                                       ftype=u'email',
                                       mime=mime,
                                       info=info))
            else:
                attchtxt = parsedtxt['body'] or u''
                # Only the detected type is needed here; info/mime of the
                # enclosing email were already consumed at i == 0.
                _, _, attch_ftype = get_file_info_from_buffer(attchtxt)
                emlparsed.extend(
                    parse_multi_layer_file(uri=parsedtxt['filename'],
                                           txt=attchtxt,
                                           ftype=attch_ftype,
                                           okext=okext))
        return emlparsed

    parsedtxt = parse_binary(string=txt,
                             fname=uri)
    if not ftype:
        # NOTE(review): this extension fallback is overwritten a few lines
        # below by get_file_info_from_buffer; kept to preserve the original
        # statement order until the intended source of ftype is confirmed.
        ftype = uri.split('.')[-1]
    parseddict = {u'body': parsedtxt if parsedtxt else u''}
    info, mime, ftype = get_file_info_from_buffer(txt)
    parseddict[u'filename'] = sane_unicode(uri)
    return [fit_into_data_mold(parseddict=parseddict,
                               txt=txt,
                               uri=uri,
                               ftype=ftype,
                               mime=mime,
                               info=info)]
Example #4
0
def parse_binary(string=None, fname=None, suffix=None, okext=OKEXT,
                 tryagain=True, **xargs):
    """
    Parse a binary file or string.

    string:   content to parse.  When falsy, the file named by fname is
              parsed instead (if its suffix is in okext).
    fname:    file name; used to read the file when no string is given,
              or to derive the suffix when none is supplied.
    suffix:   extension that selects the parser for string.  When
              missing it is taken from fname, or from
              guess_ext_from_mime(string) if there is no fname.
    okext:    collection of accepted extensions.
    tryagain: when the first parse yields nothing, retry once with the
              extension guessed from the content.
    xargs:    accepted for call compatibility; not used.

    Returns the parser's result, or None when the extension is not
    accepted.

    Raises ValueError('Did not provide string or fname') when neither
    string nor fname is given but an accepted suffix is.
    """
    if not string:
        if fname:
            if get_file_suffixes(fname) in okext:
                return parse_binary_from_file(fname)
            return None
        if suffix not in okext:
            return None
        raise ValueError('Did not provide string or fname')

    if not suffix:
        if fname:
            suffix = auto_unicode_dang_it('.' +
                                          fname.split('.')[-1]).encode('ascii')
        else:
            suffix = guess_ext_from_mime(string)
    elif fname and (get_file_suffixes(fname) not in okext):
        return None
    # A caller supplying both string and suffix used to hit the
    # "Did not provide string or fname" error even though the string was
    # provided; it now falls through to the normal parse below.

    # Guard against a falsy suffix (e.g. the mime guess found nothing)
    # before calling .lower() on it.
    if (not suffix) or (suffix.lower() not in okext):
        return None

    prsd = parse_binary_from_string(string=string, suffix=suffix)
    # `not prsd` also covers a parser that returns None, where len() would
    # raise TypeError.
    if tryagain and not prsd:
        try:
            extbymime = guess_ext_from_mime(string)
        except KeyError:
            extbymime = None
        if extbymime and (extbymime.lower() in okext):
            try:
                return parse_binary_from_string(string,
                                                suffix=extbymime)
            except ValueError:
                LOG.debug('body len=0, and mime '
                          'derived ext resulted in ValueError, giving up.\t'
                          'Supplied ext:\t%s\t'
                          'Mime derived ext:\t%s\t'
                          'Filename:\t%s',
                          suffix, extbymime, fname)
    return prsd
Example #5
0
def atch_fname_from_dispositions(dispositions):
    """
    Return the attachment file name found in Content-Disposition parameters.

    dispositions: the header value already split on ';'.  The first item
                  (the disposition type) is skipped.

    The first parameter whose label contains "filename" wins.  Its value is
    converted with auto_unicode_dang_it, stripped, lower-cased, and cleaned
    of surrounding '*', a leading utf-8'' charset tag, '%20' escapes and
    surrounding double quotes.

    Returns None when no parameter carries a file name.
    """
    charset_prefix = b"utf-8''"
    for param in dispositions[1:]:
        pieces = param.split(b"=")
        if len(pieces) < 2:
            # No '=' at all (e.g. an empty item after a trailing ';'):
            # not a label=value parameter, so there is nothing to read.
            continue
        label = pieces[0]
        if b"filename" not in label:
            continue
        # Values containing '=' are re-joined without the separator, which
        # is what the previous three-part fallback did; this extends it to
        # any number of parts instead of raising ValueError.
        name = b"".join(pieces[1:])
        if len(pieces) > 2:
            LOG.debug("EmailPath:\t%s\t%s", param, name or b"Name==None")
        name = auto_unicode_dang_it(name).strip().lower()
        name = name.strip(b"*")
        # Remove the charset tag as a prefix.  str.strip() treats its
        # argument as a set of characters and would also eat leading or
        # trailing 'u', 't', 'f', '-', '8' and "'" from the real name.
        if name.startswith(charset_prefix):
            name = name[len(charset_prefix):]
        name = name.replace(b"%20", b" ").strip(b'"')
        return name
    return None
Example #6
0
def email_parse_attachment(msgpart):  # TODO (steven_c) Make less complex.
    """
    Turn one message part into an attachment dict, if it is an attachment.

    msgpart: a message part offering get(), get_payload(), get_filename()
             and get_content_type().

    Returns None when the part has no Content-Disposition header, when the
    disposition type is not "attachment", or when reading / base64-decoding
    the payload fails.  Otherwise returns a dict with the keys
    u"body" (payload, base64-decoded when the transfer encoding says so),
    u"type" (content type) and u"filename".

    The file name is taken from the Content-Disposition parameters and
    falls back to msgpart.get_filename() when they do not provide one.
    """
    content_disposition = msgpart.get(b"Content-Disposition", None)
    if not content_disposition:
        return None
    dispositions = content_disposition.strip().split(b";")
    # Whitespace before the ';' is legal, so strip the type token itself.
    if dispositions[0].strip().lower() != b"attachment":
        return None

    filedata = msgpart.get_payload()
    try:
        # NOTE(review): a part without a Content-Transfer-Encoding header
        # raises AttributeError here and is therefore dropped -- confirm
        # that this is intended.
        if b"base64" in msgpart.get(b"Content-Transfer-Encoding", None).lower():
            filedata = b64decode(filedata)
    except (AttributeError, TypeError):
        return None

    fname = auto_unicode_dang_it(msgpart.get_filename())
    # Raw literal: same pattern as before, without the invalid "\." escape.
    if match(br"(Untitled)(.{0,3})(attachment)(.{0,10})(\.txt)", fname):
        filedata = u""

    return {
        u"body": filedata,
        u"type": msgpart.get_content_type(),
        # fyi, this is a filename not pointer.
        # Prefer the name parsed from the disposition parameters, but do not
        # overwrite a known file name with None when they carry none.
        u"filename": atch_fname_from_dispositions(dispositions) or fname,
    }
Example #7
0
def email_parse(content, extraheaders=EXTRA_HEADERS, extraaddress_headers=EXTRA_ADDRESS_HEADERS):
    """
    Parse a raw email into a dict of fields plus its attachments.

    Returns unicode.

    Converts 'Date' to UTC.

    content:              the raw message; it is passed through str()
                          before parsing.
    extraheaders:         header names copied into the result via
                          auto_unicode_dang_it; u"" when unavailable.
    extraaddress_headers: header names parsed as address lists; each
                          becomes a tuple of (name, address) tuples, or
                          ((u"", u""),) when unavailable.

    Returns (msgbits, attachments).  msgbits has the keys u"subject",
    u"body", u"from", u"attachment" and u"datetime" plus one key per
    extra header.  attachments is the list of dicts produced by
    email_parse_attachment.

    Raises ValueError when building the required fields raises ValueError.
    """
    # Function-scope import: only parseaddr is known to be imported by the
    # module.
    from email.utils import getaddresses

    msgobj = Parser().parsestr(str(content))

    raw_subject = msgobj["Subject"]
    if raw_subject is not None:
        subj_fragments = []
        for frag, enc in decode_header(raw_subject):
            # Convert every fragment, not only those with a declared
            # charset, so the join below never mixes text types.
            if enc:
                frag = auto_unicode_dang_it(frag, enc)
            else:
                frag = auto_unicode_dang_it(frag)
            subj_fragments.append(frag)
        subject = u"".join(subj_fragments)
    else:
        subject = u""

    attachments = []
    body_text = u""
    for part in msgobj.walk():
        attachment = email_parse_attachment(part)
        if attachment:
            attachments.append(attachment)
        elif part.get_content_type() == "text/plain":
            bodypayload = part.get_payload(decode=True)
            charset = part.get_content_charset()
            if not charset:
                charset = "utf-8"
            if bodypayload:
                body_text += auto_unicode_dang_it(bodypayload, charset)
        elif (not body_text) and (part.get_content_type() == "text/html"):
            # HTML is only used while no plain-text body has been collected.
            htmlpayload = part.get_payload(decode=True)
            if htmlpayload:
                body_text += auto_unicode_dang_it(htmlpayload, part.get_content_charset(), "replace")

    try:
        try:
            sent_at = sane_unicode(normize_dtime_tmzn_nrth_am(msgobj["date"]))
        except (TypeError):
            sent_at = None
        msgbits = {
            u"subject": auto_unicode_dang_it(subject),
            u"body": body_text,
            # 'body_html': body_html,
            # A missing From header yields None; parse "" instead so the
            # result is an empty (name, address) pair.
            u"from": tuple([auto_unicode_dang_it(addr) for addr in parseaddr(msgobj.get("From") or "")]),
            u"attachment": attch_stats_from_attchdict(attachments),
            u"datetime": sent_at,
        }
    except ValueError:
        LOG.critical("Could not parse required headers")
        raise ValueError("Was not able to parse all required email headers.")

    no_address = tuple([(u"", u"")])
    if extraaddress_headers:
        for field in extraaddress_headers:
            try:
                raw_value = msgobj[field]
                if raw_value is None:
                    msgbits[field] = no_address
                    continue
                # getaddresses honours quoting, so a display name such as
                # "Doe, John" is not split at its comma.
                parsed = tuple(
                    [
                        tuple([auto_unicode_dang_it(person) for person in pair])
                        for pair in getaddresses([raw_value])
                    ]
                )
                msgbits[field] = parsed or no_address
            except (KeyError, AttributeError):
                msgbits[field] = no_address
    if extraheaders:
        for field in extraheaders:
            try:
                msgbits[field] = auto_unicode_dang_it(msgobj[field])
            except (KeyError, AttributeError, ValueError):
                msgbits[field] = u""
    return msgbits, attachments