Exemple #1
0
def parse_multi_layer_file(uri, txt=None, ftype=None, okext=OKEXT):
    """
    Can handle files that contain files, e.g. emails with attachments.
    Returns a list with parsed files each in a dict tree.

        uri:   path/name of the file; it is opened and read only when
               ``txt`` is None.
        txt:   content of the file, if the caller already has it.
        ftype: is either the extension or a set definition, e.g. email.
               NOTE(review): the value stored in the result is always the
               one detected from the content (or u'email' for the top
               level of an email); the argument itself never reaches the
               result -- confirm whether that is intended.
        okext: handed on unchanged to the recursive calls made for email
               attachments; not otherwise used in this function.

    For an email the first element of the returned list is the message
    itself and the following elements come from parsing each attachment
    recursively. If ``email_whole_parse`` yields nothing, an empty list
    is returned.
    """
    if txt is None:
        with open(uri) as fobj:
            txt = fobj.read()

    if is_an_email(uri, text=txt):
        parsedtxtlist = email_whole_parse(uri=uri,
                                          text=txt)
        info, mime, _unused_ftype = get_file_info_from_buffer(txt)
        # Bind the result list before the loop: previously it was created
        # only inside the ``i == 0`` branch, so an empty parse result made
        # the ``return`` below fail with UnboundLocalError.
        emlparsed = []
        for i, parsedtxt in enumerate(parsedtxtlist):
            if i == 0:
                # the zero gen file does not return with a file name
                parsedtxt['filename'] = uri
                emlparsed.append(
                    fit_into_data_mold(parseddict=parsedtxt,
                                       txt=txt,
                                       uri=auto_unicode_dang_it(uri),
                                       ftype=u'email',
                                       mime=mime,
                                       info=info))
                continue
            # Every later entry is handled as an attachment and parsed
            # recursively, since it may itself contain further files.
            attchtxt = parsedtxt['body'] or u''
            _info, _mime, attch_ftype = get_file_info_from_buffer(attchtxt)
            emlparsed.extend(
                parse_multi_layer_file(uri=parsedtxt['filename'],
                                       txt=attchtxt,
                                       ftype=attch_ftype,
                                       okext=okext))
        return emlparsed

    parsedtxt = parse_binary(string=txt,
                             fname=uri)
    if not ftype:
        # NOTE(review): this extension fallback is overwritten by the
        # content-based detection a few lines below; kept as-is because
        # it is unclear which of the two is meant to take precedence.
        ftype = uri.split('.')[-1]
    parseddict = {u'body': parsedtxt if parsedtxt else u''}
    info, mime, ftype = get_file_info_from_buffer(txt)
    parseddict[u'filename'] = sane_unicode(uri)
    return [fit_into_data_mold(parseddict=parseddict,
                               txt=txt,
                               uri=uri,
                               ftype=ftype,
                               mime=mime,
                               info=info)]
def test_sane_unicode4():
    """sane_unicode(TEST_STRING_4) must equal the quoted euro sign, as unicode."""
    expected = u'"€"'
    converted = sane_unicode(TEST_STRING_4)
    assert converted == expected
    assert isinstance(converted, unicode)
def test_sane_unicode3():
    """sane_unicode(TEST_STRING_3) must equal the quoted accented word, as unicode."""
    expected = u'"más"'
    converted = sane_unicode(TEST_STRING_3)
    assert converted == expected
    assert isinstance(converted, unicode)
def test_sane_unicode1():
    """sane_unicode(TEST_STRING_1) must equal the full quotation, as unicode."""
    expected = u"If numbers aren't beautiful, I don't know what is. –Paul Erdős"
    converted = sane_unicode(TEST_STRING_1)
    assert converted == expected
    assert isinstance(converted, unicode)
def test_sane_unicode__with_unicode():
    """Input that is already unicode must come back equal and still unicode."""
    expected = u"monkey"
    converted = sane_unicode(u"monkey")
    assert converted == expected
    assert isinstance(converted, unicode)
Exemple #6
0
def get_file_info_from_buffer(txt):  # Consider putting in utils
    """
    Describe a buffer using two ``from_buffer`` look-ups.

    Returns a 3-tuple ``(info, mime, ftype)``, every element passed
    through ``sane_unicode``:

        info:  result of ``from_buffer(txt)``
        mime:  result of ``from_buffer(txt, True)``
               (presumably the MIME type -- confirm against from_buffer)
        ftype: whatever follows the last u'/' in ``mime``
    """
    description = sane_unicode(from_buffer(txt))
    mimetype = sane_unicode(from_buffer(txt, True))
    # Only the part after the final slash is kept as the file type.
    subtype = mimetype.rsplit(u'/', 1)[-1]
    return description, mimetype, sane_unicode(subtype)  # Consider using a namedtuple.
Exemple #7
0
def email_parse(content, extraheaders=EXTRA_HEADERS, extraaddress_headers=EXTRA_ADDRESS_HEADERS):
    """
    Parse a raw email into a dict of unicode fields plus its attachments.

    content:              the raw message; it is passed through ``str()``
                          before being handed to the parser.
    extraheaders:         header names copied into the result as text;
                          u"" when a header cannot be read or converted.
    extraaddress_headers: header names parsed as address lists; each one
                          becomes a tuple of (name, address) pairs, or
                          ((u"", u""),) when the header is missing or
                          holds no address.

    Returns ``(msgbits, attachments)``:

        msgbits:     dict with the keys u"subject", u"body", u"from",
                     u"attachment" and u"datetime", plus one key per
                     requested extra header.
        attachments: list of the truthy results of
                     ``email_parse_attachment`` over all message parts.

    u"body" is the concatenation of all text/plain parts; a text/html
    part is used only while no body text has been collected yet.
    u"datetime" is ``normize_dtime_tmzn_nrth_am`` applied to the 'Date'
    header (presumably converting to UTC -- confirm in that helper), or
    None when that helper raises TypeError.

    Raises ValueError when building the core fields raises ValueError.
    """
    # Imported locally so this block does not depend on the file's import
    # section already providing it.
    from email.utils import getaddresses

    msgobj = Parser().parsestr(str(content))

    raw_subject = msgobj["Subject"]
    if raw_subject is not None:
        subj_fragments = []
        for s, enc in decode_header(raw_subject):
            # Fragments without a declared encoding are kept untouched.
            if enc:
                s = auto_unicode_dang_it(s, enc)
            subj_fragments.append(s)
        subject = "".join(subj_fragments)
    else:
        subject = u""

    attachments = []
    body_text = u""
    for part in msgobj.walk():
        attachment = email_parse_attachment(part)
        if attachment:
            attachments.append(attachment)
        elif part.get_content_type() == "text/plain":
            bodypayload = part.get_payload(decode=True)
            charset = part.get_content_charset()
            if not charset:
                charset = "utf-8"
            if bodypayload:
                body_text += auto_unicode_dang_it(bodypayload, charset)
        elif (not body_text) and (part.get_content_type() == "text/html"):
            # HTML is only a fallback for messages without plain text so far.
            htmlpayload = part.get_payload(decode=True)
            if htmlpayload:
                body_text += auto_unicode_dang_it(htmlpayload, part.get_content_charset(), "replace")

    try:
        try:
            # Local renamed from ``datetime`` so it cannot shadow a module
            # of that name inside this function.
            sent_at = sane_unicode(normize_dtime_tmzn_nrth_am(msgobj["date"]))
        except (TypeError):
            sent_at = None
        msgbits = {
            u"subject": auto_unicode_dang_it(subject),
            u"body": body_text,
            # 'body_html': body_html,
            u"from": tuple([auto_unicode_dang_it(addr) for addr in parseaddr(msgobj.get("From"))]),
            u"attachment": attch_stats_from_attchdict(attachments),
            u"datetime": sent_at,
        }
    except ValueError:
        LOG.critical("Could not parse required headers")
        raise ValueError("Was not able to parse all required email headers.")

    if extraaddress_headers:
        no_address = tuple([(u"", u"")])
        for field in extraaddress_headers:
            try:
                raw_value = msgobj[field]
                # getaddresses understands quoting, so a display name such
                # as "Doe, John" stays one entry; splitting the header on
                # every comma used to cut such names in half.
                pairs = getaddresses([raw_value]) if raw_value is not None else []
                parsed = tuple(
                    [
                        tuple([auto_unicode_dang_it(person) for person in pair])
                        for pair in pairs
                    ]
                )
                # Missing or address-less headers keep the old placeholder.
                msgbits[field] = parsed if parsed else no_address
            except (KeyError, AttributeError):
                msgbits[field] = no_address

    if extraheaders:
        for field in extraheaders:
            try:
                msgbits[field] = auto_unicode_dang_it(msgobj[field])
            except (KeyError, AttributeError, ValueError):
                msgbits[field] = u""

    return msgbits, attachments