Example #1
0
def get_cookie_votes(outconn, lookupcur, annocur, cookie_fiwalk_id, cookie_contents):
    """
    Build every geographic "vote" that can be derived from one cookie.

    Input:
      outconn - Output database connection.  Not referenced in this function.
      lookupcur - Cursor handed to the location lookup helpers.  (Presumably
        may be None when no lookup database is live, in which case the helpers
        are expected to return None -- confirm against geoproc_library.)
      annocur - Cursor on the file system annotation database, or a false
        value to skip the file system metadata lookup.
      cookie_fiwalk_id - Fiwalk identifier of the file holding the cookie.
      cookie_contents - Entire contents of the cookie (this can come from an
        HTTP header or a file).
    Output: List of dictionaries, one per MSN location match that carries both
      a latitude and a longitude, followed by one per IPv4 address found in the
      cookie text.  Every dictionary starts as a copy of the shared base vote
      (fiwalk_id, plus file system identifiers and a timestamp when annocur
      yields exactly one row).
    """
    cookie_contents_lower = cookie_contents.lower()
    retlist = []

    basic_vote = {"fiwalk_id": cookie_fiwalk_id}

    #Fill in file system info from annodb
    if annocur:
        annocur.execute("""
          SELECT
            tsk_obj_id,
            tf.fs_obj_id,
            tf.mtime,
            tf.atime,
            tf.ctime,
            tf.crtime
          FROM
            fiwalk_id_to_tsk_obj_id,
            tskout.tsk_files AS tf
          WHERE
            fiwalk_id_to_tsk_obj_id.tsk_obj_id = tf.obj_id AND
            fiwalk_id = ?
          ;
        """, (cookie_fiwalk_id,))
        annorows = list(annocur)
        #Only trust the metadata when the fiwalk_id maps to exactly one file.
        if len(annorows) == 1:
            #Convert the row to a plain dictionary once, instead of once per time field.
            annorow = {key: annorows[0][key] for key in annorows[0].keys()}
            basic_vote["fs_obj_id"] = annorow["fs_obj_id"]
            basic_vote["obj_id"] = annorow["tsk_obj_id"]
            #The first populated timestamp, in this order of preference, wins.
            for timefield in ("mtime", "atime", "ctime", "crtime"):
                if annorow.get(timefield):
                    basic_vote["selected_time_type"] = timefield
                    basic_vote["believed_timestamp"] = dfxml.dftime(annorow[timefield]).iso8601()
                    break

    #TODO use
    #(The result is not consumed yet; the call is retained so any lookup it performs still happens.)
    city_matches = match_cities(lookupcur, cookie_contents)

    #Perform MSN matches
    for m in all_msn_matches(cookie_contents):
        if not (m.get("longitude") and m.get("latitude")):
            continue
        retdict = copy.deepcopy(basic_vote)
        retdict["record_type"] = "msn"
        retdict["latitude"] = float(m["latitude"])
        retdict["longitude"] = float(m["longitude"])
        if m.get("countrycode"):
            retdict["country"] = m["countrycode"]
            #.get() so a match with a country code but no zip code does not raise KeyError.
            retdict["postalCode"] = m.get("zipcode")
        locations_from_latlongs = geoproc_library.latlongs_to_networked_locations(lookupcur, retdict["latitude"], retdict["longitude"], 30)
        if locations_from_latlongs is None:
            sys.stderr.write("Warning: Couldn't look up latitude/longitude.\n")
            retdict["database_queried"] = False
        elif len(locations_from_latlongs) > 0:
            #Use closest location
            locrec = locations_from_latlongs[0]
            for locfield in ("country", "postalCode"):
                if not locrec.get(locfield):
                    continue
                if retdict.get(locfield):
                    agree_key = "cookie_latlong_and_maxmind_agree_on_" + locfield
                    retdict[agree_key] = locrec[locfield] == retdict[locfield]
                    if not retdict[agree_key]:
                        #Report the values of the field actually being compared.
                        sys.stderr.write("Warning: Data anomaly: MSN cookie reports %s %r, lat/long seem to be in %r by MaxMind.\n" % (locfield, retdict[locfield], locrec[locfield]))
                else:
                    #The cookie did not supply this field; take it from the lookup.
                    retdict[locfield] = locrec[locfield]
            retdict["region"] = locrec["region"]
            retdict["city"] = locrec["city"]
            retdict["database_queried"] = True
        retlist.append(retdict)

    #Perform IPv4 text matches
    all_ips = geoproc_library.all_ipv4s(cookie_contents)
    all_ip_locations = None
    if all_ips:
        believed_cookie_time = None #TODO Get actual times from database, loop through them
        #One lookup covers every address; the arguments do not vary per address.
        all_ip_locations = geoproc_library.ips_to_locations(lookupcur, believed_cookie_time, all_ips)
    for ipv4 in all_ips:
        retdict = copy.deepcopy(basic_vote)
        retdict["record_type"] = "ipv4"
        retdict["ipv4"] = ipv4
        retdict["database_queried"] = all_ip_locations is not None
        if all_ip_locations is not None and ipv4 in all_ip_locations:
            rec = all_ip_locations[ipv4]
            retdict["latitude"] = rec.get("latitude")
            retdict["longitude"] = rec.get("longitude")
            retdict["postalCode"] = rec.get("postalCode")
            retdict["maxmind_ipv4_time"] = dfxml.dftime(rec.get("maxmind_ipv4_time")).iso8601()
            #For each place name found, also note whether the cookie text itself mentions it.
            for placefield in ("country", "region", "city"):
                if rec.get(placefield):
                    retdict[placefield] = rec[placefield]
                    retdict[placefield + "_found_in_text"] = rec[placefield].lower() in cookie_contents_lower
        retlist.append(retdict)
    return retlist
Example #2
0
def _connect_lookup_db():
    """
    Try to open the MySQL lookup database named in the geoproc configuration.

    Output: (connection, cursor).  Either element is None if it could not be
      created; a warning is written to stderr and the caller proceeds without
      database support.
    """
    cfg = geoproc_cfg.config
    lookupconn = None
    lookupcur = None
    try:
        #Imported here so a missing MySQL driver only disables lookups.
        import mysql.connector as mdb
        lookupconn = mdb.connect(
          host=cfg.get("mysql", "maxmind_server"),
          user=cfg.get("mysql", "maxmind_read_username"),
          password=geoproc_cfg.db_password("maxmind_read_password_file"),
          db=cfg.get("mysql", "maxmind_schema"),
          use_unicode=True
        )
        lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    except Exception as e:
        #Exception (not a bare except) so Ctrl-C and sys.exit() still propagate.
        sys.stderr.write("Warning: Could not connect to database. Proceeding without database support.\n")
        sys.stderr.write("Warning: Reason: %r\n" % (e,))
    return (lookupconn, lookupcur)


def _received_timestamp(pathline):
    """
    Output: dfxml.dftime for the first RFC 822 date found in a Received header
      line, or None if no date matches or the matched text fails to parse.
    """
    maybe_timestamp_match = dfxml.rx_rfc822datetime.search(pathline)
    if not maybe_timestamp_match:
        return None
    thedatestring = maybe_timestamp_match.group(0)
    try:
        return dfxml.dftime(thedatestring)
    except Exception:
        sys.stderr.write("Warning: An error occured trying to parse time input.\nInput:%r\nStack trace:\n" % thedatestring)
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write("\n")
        #Don't stop here.
        return None


def _sole_recipient(message):
    """
    Output: The message's single recipient header value, or None.  A lone
      Delivered-To header is preferred; otherwise a lone, single-line To header.
      (This is, of course, not guaranteed to be the owner.)
    """
    delivered_to_headers = message.get_all("Delivered-To")
    if delivered_to_headers and len(delivered_to_headers) == 1:
        return delivered_to_headers[0]
    to_headers = message.get_all("To")
    if to_headers and len(to_headers) == 1 and len(to_headers[0].split("\n")) == 1:
        return to_headers[0]
    return None


def main():
    """
    Write one vote row per IPv4 address found in the Received headers of the
    emails listed in <args.emaildir>/manifest.txt, into a new SQLite database
    at <args.outdir>/email_files_votes.db.

    Reads the module-level args (annodb, emaildir, outdir,
    bulk_extractor_output).  Only Received lines with a parseable date produce
    rows.  Rows are committed once, after all messages are processed.

    Raises Exception if manifest.txt is missing or the output database
    already exists.
    """
    global args

    #Set up lookup database connection
    lookupconn, lookupcur = _connect_lookup_db()
    try:
        #Connect to annodb
        #TODO Use annodb to get TSK identifiers
        annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.annodb)

        #Verify input
        manifest_path = os.path.join(args.emaildir, "manifest.txt")
        if not os.path.isfile(manifest_path):
            raise Exception("Error: manifest.txt not found in input directory.")

        #Ingest BE ips, if available
        #Stash in (once-tested) histogram.
        #Dictionary key: ipv4 address
        #Dictionary value: (notes, tally) default dictionary.
        ip_notes_histogram = collections.defaultdict(lambda: collections.defaultdict(int))
        if args.bulk_extractor_output:
            for (forensic_path, ipv4, ipv4_notes) in geoproc_library.bulk_extractor_ips(args.bulk_extractor_output):
                ip_notes_histogram[ipv4][ipv4_notes] += 1
        dprint("Debug: Number of IPv4s with notes: %d." % len(ip_notes_histogram))

        #Set up output database
        outdbpath = os.path.join(args.outdir, "email_files_votes.db")
        if os.path.isfile(outdbpath):
            raise Exception("Error: Output database already exists. This script won't overwrite. Aborting.")
        outconn = sqlite3.connect(outdbpath)
        try:
            outconn.isolation_level = "EXCLUSIVE"
            outconn.row_factory = sqlite3.Row
            outcur = outconn.cursor()
            outcur.execute(SQL_CREATE_EMAIL_FILES_VOTES)

            for (fiwalk_id, messageno, message) in emails_in_dir_manifest(manifest_path):
                dprint("Debug: Analyzing a record from fiwalk_id %r." % fiwalk_id)
                received_recs = message.get_all("Received")
                if not received_recs:
                    continue
                pathlength = len(received_recs)
                for (pathindex, pathline) in enumerate(received_recs):
                    #TODO Just getting all the IPs for now; filter later
                    ips = geoproc_library.all_ipv4s(pathline)
                    dprint("Debug: Found this many IP's: %d.\n\t%r" % (len(ips), ips))

                    #Can we get a date?
                    maybe_timestamp = _received_timestamp(pathline)
                    dprint("Debug: Believed timestamp: %r." % maybe_timestamp)

                    #Without a date there is nothing to look up for this header line.
                    if not maybe_timestamp:
                        continue

                    #Can we get a single recipient?
                    sole_recipient = _sole_recipient(message)
                    all_ip_locations = geoproc_library.ips_to_locations(lookupcur, maybe_timestamp.datetime(), ips)
                    dprint("Debug: Fetched these IP location records:\n\t%r" % all_ip_locations)
                    for ip in ips:
                        outdict = {
                          "fiwalk_id": fiwalk_id,
                          "message_index": messageno,
                          "ipv4": ip,
                          "received_path_index": pathindex,
                          "received_path_length": pathlength,
                          "received_header_text": pathline,
                          "database_queried": all_ip_locations is not None,
                          "believed_timestamp": str(maybe_timestamp),
                          "sole_recipient_domain_is_webmail": geoproc_library.in_webmail_domain(sole_recipient)
                        }
                        if all_ip_locations is not None and ip in all_ip_locations:
                            rec = all_ip_locations[ip]
                            outdict["latitude"] = rec.get("latitude")
                            outdict["longitude"] = rec.get("longitude")
                            outdict["postalCode"] = rec.get("postalCode")
                            outdict["maxmind_ipv4_time"] = dfxml.dftime(rec.get("maxmind_ipv4_time")).iso8601()
                            for placefield in ("country", "region", "city"):
                                if rec.get(placefield):
                                    outdict[placefield] = rec[placefield]
                            dprint("Debug: Checking for IP notes for %r." % ip)
                            #Membership test does not create an entry in the defaultdict.
                            if ip in ip_notes_histogram:
                                dprint("Debug: Formatting notes for %r." % ip)
                                notedict = ip_notes_histogram[ip]
                                #"<tally> <note>" entries, ordered by note text.
                                outdict["ipv4_be_notes"] = "; ".join("%d %r" % (notedict[note], note) for note in sorted(notedict))
                                outdict["ipv4_be_has_cksum_or_socket"] = "sockaddr" in outdict["ipv4_be_notes"] or "cksum-ok" in outdict["ipv4_be_notes"]
                            dprint("Debug: Outdict just before inserting:\n\t%r" % outdict)
                        #A row is written for every IP, whether or not a location was found.
                        geoproc_library.insert_db(outcur, "email_files_votes", outdict)
            outconn.commit()
        finally:
            #Release the database file even if processing failed part-way.
            outconn.close()
    finally:
        if lookupconn is not None:
            lookupconn.close()
    dprint("Debug: Done.")