def parse_body(body):
    """
        Parse a raw email message and extract the fields of interest.

        Parameters:
            body: the complete message as bytes (headers followed by content).

        Returns:
            A dict with the keys 'from', 'to', 'subject' and 'date' (the
            matching header values, None when a header is absent) and 'text'
            (the content of the text/plain body part, or '' when the message
            has no such part or the part cannot be decoded).
    """
    msg = BytesParser(policy=policy.SMTP).parsebytes(body)
    print("This is the message: ", msg.keys())
    print("From : ",msg['From'])
    print("Date: ",msg['Date'])
    print("To: ",msg['To'])
    print("Subject : ",msg['Subject'])

    text = None
    try:
        # preferencelist has to be a real tuple: with a bare string the
        # subtype lookup degrades to a substring match.
        part = msg.get_body(preferencelist=('plain',))
        if part is not None:
            text = part.get_content()
    except Exception:
        # Deliberately best-effort: a body that cannot be decoded (unknown
        # charset, broken transfer encoding, ...) is treated like a missing one.
        text = None

    if text is None:
        print('Incoming message does not have an plain text part - skipping this part.')
        text = ''

    return {
        'from': msg['From'],
        'to': msg['To'],
        'subject': msg['Subject'],
        'date': msg['Date'],
        'text': text,
    }
# Example no. 2
# 0
def process_probe(row):
    """
        Turn one raw HTTP probe response into a flat dict of features.

        Parameters:
            row: mapping holding the raw response bytes under "data" and the
                 probe name under "type".

        Returns:
            {} when the payload does not start with b"HTTP/", when no blank
            line separates headers from content, or when the header block has
            no CRLF after the status line. Otherwise a dict whose keys are all
            prefixed with "<type>:":
                status_code    float; -1 when the code is not numeric, None
                               when the status line carries no code
                status_text    reason phrase as bytes, or None when absent
                header_keys    header names as returned by headers.keys()
                header:<name>  value of each header
                dom_tree       output of tag_recursive() for the parsed
                               content, or "" when parsing raises ParserError
    """
    payload = row["data"]
    if not payload.startswith(b"HTTP/"):
        return {}  # TODO: do some kind of content analysis

    # Rewrite the first CRLF CRLF to LF LF so that one split handles both
    # styles of header/content separator.
    response = payload.replace(b"\r\n\r\n", b"\n\n", 1)

    try:
        raw_headers, content = response.split(b"\n\n", 1)
        status_line, headers_alone = raw_headers.split(b"\r\n", 1)
    except ValueError:
        return {}

    # "HTTP/x.y <code> <reason>": be lenient about a missing reason phrase so
    # that the status code is still picked up.
    status_parts = status_line.split(b" ", 2)
    status_code = status_parts[1] if len(status_parts) > 1 else None
    status_text = status_parts[2] if len(status_parts) > 2 else None

    headers = BytesParser().parsebytes(headers_alone)

    # str(): with the default (compat32) policy a value containing non-ASCII
    # bytes is returned as a Header object, which has no .split().
    # lower(): transfer-coding names are case-insensitive.
    transfer_encoding = [
        token.strip().lower()
        for token in str(headers.get("Transfer-Encoding", "")).split(",")
    ]
    if "chunked" in transfer_encoding:
        # the content is chunked and needs to be merged
        content = merge_chunks(content)

    # parse html; content that cannot be parsed leaves an empty tag tree
    tag_tree = ""
    try:
        tag_tree = tag_recursive(html.fromstring(content))
    except ParserError:
        pass

    probe_type = row["type"]

    # float() rather than int(): per the original TODO, IIS may report
    # decimals in the status code.
    try:
        numeric_status = float(status_code)
    except ValueError:
        numeric_status = -1  # a code was present but is not a number
    except TypeError:
        numeric_status = None  # no code could be extracted at all

    data = {
        "{}:status_code".format(probe_type): numeric_status,
        "{}:status_text".format(probe_type): status_text,
        "{}:header_keys".format(probe_type): headers.keys(),
    }
    for header in headers:
        data["{}:header:{}".format(probe_type, header)] = headers[header]
    data["{}:dom_tree".format(probe_type)] = tag_tree

    return data