Code example #1
def main():
    cfg = geoproc_cfg.config
    lookupconn = None
    lookupcur = None
    try:
        import mysql.connector as mdb
        lookupconn = mdb.connect(
          host=cfg.get("mysql", "maxmind_server"),
          user=cfg.get("mysql", "maxmind_read_username"),
          password=geoproc_cfg.db_password("maxmind_read_password_file"),
          db=cfg.get("mysql", "maxmind_schema"),
          use_unicode=True
        )
        lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    except Exception:
        sys.stderr.write("Warning: Could not connect to database. Proceeding without database support.\n")

    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.anno)

    outconn = sqlite3.connect(os.path.join(args.out_dir, "cookie_files_votes.db"))
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_COOKIE_FILES_VOTES)
    
    #Walk cookie dump directory
    #TODO? Use a validly-extracted manifest file instead of walking dump directory.
    for (dirpath, dirnames, filenames) in os.walk(args.cookie_dump_dir):
        for cookie_txt_fname in (x for x in filenames if x.endswith("txt")):
            cookie_fiwalk_id = int(os.path.splitext(cookie_txt_fname)[0])
            dprint("Reading cookie_fiwalk_id: %r." % cookie_fiwalk_id)
            with open(os.path.join(dirpath, cookie_txt_fname),"r",encoding="utf-8") as cookie_file:
                try:
                    some_kibs = cookie_file.read(0x8000)
                except Exception:
                    sys.stderr.write("Warning: Reading file %r failed.  Stack trace follows.\n" % cookie_txt_fname)
                    sys.stderr.write(traceback.format_exc())
                    continue
                if len(some_kibs) == 0x8000:
                    sys.stderr.write("Warning: Skipped abnormally large 'cookie' file, >=32KiB: %r.\n" % cookie_txt_fname)
                    continue
                votes = get_cookie_votes(outconn, lookupcur, annocur, cookie_fiwalk_id, some_kibs)
                for vote in votes:
                    geoproc_library.insert_db(outcur, "cookie_files_votes", vote)
    outconn.commit()
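
The insert_db helper from geoproc_library does not appear in these excerpts. As a minimal sketch of what a dict-to-row insert could look like for the SQLite output cursors used here (an assumption for illustration, not the project's actual implementation):

def insert_db(cursor, table_name, record_dict):
    #Hypothetical sketch: build a parameterized INSERT from a column->value dict.
    #SQLite named placeholders (:col) keep the values out of the SQL text;
    #the table and column names are trusted because they come from this project's own code.
    columns = sorted(record_dict.keys())
    sql = "INSERT INTO %s (%s) VALUES (%s);" % (
      table_name,
      ", ".join(columns),
      ", ".join(":" + col for col in columns)
    )
    cursor.execute(sql, record_dict)
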
Code example #2
def main():
    lookupconn = None
    lookupcur = None
    import mysql.connector as mdb
    lookupconn = mdb.connect(
      host=geoproc_cfg.config.get("mysql", "maxmind_server"),
      user=geoproc_cfg.config.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=geoproc_cfg.config.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    
    for locfield in ["country","region","city"]:
        with open("distinct_%s.txt" % locfield, "w") as outfile:
            lookupcur.execute("""
              SELECT
                %(locfield)s
              FROM
                Distinct_%(locfield)s
              ;
            """ % {"locfield":locfield})
            for row in lookupcur:
                outfile.write(row[locfield] + "\n")
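
MySQLCursorDict comes from geoproc_cfg and is not part of these excerpts. If a dict-style cursor is unavailable, one alternative sketch relies only on Connector/Python's documented column_names cursor property:

def rows_as_dicts(cursor):
    #Sketch: turn plain tuple rows into dicts keyed by column name,
    #so callers can keep writing row[locfield] as above.
    columns = cursor.column_names
    for row in cursor:
        yield dict(zip(columns, row))

Note that locfield is interpolated into the SQL with %, not bound as a query parameter; identifiers cannot be parameterized, and this is only safe because locfield comes from the fixed list in the loop.
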
Code example #3
def main():
    global args
    parser = argparse.ArgumentParser(description="Analyze Bulk Extractor EXIF output for location indicators.")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug printing (writes to stderr).")
    parser.add_argument("-r", "--regress", action="store_true", help="Run regression tests and exit.")
    args_regress = parser.parse_known_args()[0]

##  Set up regular expressions for extracting desired EXIF tags
    relatref = re.compile(br"<Exif.GPSInfo.GPSLatitudeRef>(?P<GPSLatitudeRef>[NS])</Exif.GPSInfo.GPSLatitudeRef>")
    relongref = re.compile(br"<Exif.GPSInfo.GPSLongitudeRef>(?P<GPSLongitudeRef>[EW])</Exif.GPSInfo.GPSLongitudeRef>")
    relat = re.compile(br"<Exif.GPSInfo.GPSLatitude>(?P<GPSLatitude>[0-9\-/ ]{1,40})</Exif.GPSInfo.GPSLatitude>")
    relong = re.compile(br"<Exif.GPSInfo.GPSLongitude>(?P<GPSLongitude>[0-9\-/ ]{1,40})</Exif.GPSInfo.GPSLongitude>")
    retimestamp = re.compile(br"<Exif.GPSInfo.GPSTimeStamp>(?P<GPSTimeStamp>[0-9/ ]{1,40})</Exif.GPSInfo.GPSTimeStamp>")
    redatestamp = re.compile(br"<Exif.GPSInfo.GPSDateStamp>(?P<GPSDateStamp>[0-9: .]{1,40})</Exif.GPSInfo.GPSDateStamp>")
    redatetime  = re.compile(br"<Exif.Image.DateTime>(?P<DateTime>[0-9: .]{1,40})</Exif.Image.DateTime>")
    
    if args_regress.regress:
        assert round(dms_to_decimal(b"33/1 49/1 42/1"), 4) == 33.8283
        assert round(dms_to_decimal(b"33/1 49/1 0/1"), 4) == round(dms_to_decimal(b"33/1 49/1"), 4)
        assert dms_to_decimal(b"0/0 0/0 0/0") is None #This value was observed in the Real Data Corpus.
        #TODO assert hms_fraction_to_decimal(b"8/1 49/1 18/1") == "8:49:18"
        #Sample EXIF data supplied by m57-redacted-terry-2009-12-07.aff, image offset 2116743168, pretty-printed with xmllint
        test_exif = b"""<?xml version="1.0"?>
<exif>
  <width>48</width>
  <height>48</height>
  <Exif.Image.Make>Apple</Exif.Image.Make>
  <Exif.Image.Model>iPhone</Exif.Image.Model>
  <Exif.Image.XResolution>72/1</Exif.Image.XResolution>
  <Exif.Image.YResolution>72/1</Exif.Image.YResolution>
  <Exif.Image.ResolutionUnit>2</Exif.Image.ResolutionUnit>
  <Exif.Image.DateTime>2008:11:26 11:46:56</Exif.Image.DateTime>
  <Exif.Image.ExifTag>180</Exif.Image.ExifTag>
  <Exif.Photo.FNumber>14/5</Exif.Photo.FNumber>
  <Exif.Photo.DateTimeOriginal>2008:11:26 11:46:56</Exif.Photo.DateTimeOriginal>
  <Exif.Photo.DateTimeDigitized>2008:11:26 11:46:56</Exif.Photo.DateTimeDigitized>
  <Exif.Photo.ColorSpace>1</Exif.Photo.ColorSpace>
  <Exif.Photo.PixelXDimension>1200</Exif.Photo.PixelXDimension>
  <Exif.Photo.PixelYDimension>1600</Exif.Photo.PixelYDimension>
  <Exif.Image.GPSTag>306</Exif.Image.GPSTag>
  <Exif.GPSInfo.GPSLatitudeRef>N</Exif.GPSInfo.GPSLatitudeRef>
  <Exif.GPSInfo.GPSLatitude>38/1 5354/100 0/1</Exif.GPSInfo.GPSLatitude>
  <Exif.GPSInfo.GPSLongitudeRef>W</Exif.GPSInfo.GPSLongitudeRef>
  <Exif.GPSInfo.GPSLongitude>92/1 2343/100 0/1</Exif.GPSInfo.GPSLongitude>
  <Exif.GPSInfo.GPSTimeStamp>11/1 46/1 788/100</Exif.GPSInfo.GPSTimeStamp>
  <Exif.Image.0xa500>11/5</Exif.Image.0xa500>
  <Exif.Thumbnail.Compression>6</Exif.Thumbnail.Compression>
  <Exif.Thumbnail.Orientation>6</Exif.Thumbnail.Orientation>
  <Exif.Thumbnail.XResolution>72/1</Exif.Thumbnail.XResolution>
  <Exif.Thumbnail.YResolution>72/1</Exif.Thumbnail.YResolution>
  <Exif.Thumbnail.ResolutionUnit>2</Exif.Thumbnail.ResolutionUnit>
  <Exif.Thumbnail.JPEGInterchangeFormat>550</Exif.Thumbnail.JPEGInterchangeFormat>
  <Exif.Thumbnail.JPEGInterchangeFormatLength>11682</Exif.Thumbnail.JPEGInterchangeFormatLength>
</exif>"""
        assert relat.search(test_exif) is not None
        assert relong.search(test_exif) is not None
        assert relatref.search(test_exif).group("GPSLatitudeRef") == b"N"
        assert relongref.search(test_exif).group("GPSLongitudeRef") == b"W"
        exit(0)

    parser.add_argument("-a", "--anno", help="Annotation database of Fiwalk and TSK-db")
    parser.add_argument("exif_file", type=argparse.FileType('rb'), help="Bulk Extractor exif.txt")
    args = parser.parse_args()

    dprint("Debug: args.anno = %r.\n" % args.anno)

##  Connect to db
    cfg = geoproc_cfg.config
    refconn = mysql.connector.Connect(
      host=cfg.get("mysql", "maxmind_server"),
      user=cfg.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=cfg.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    if refconn is None:
      raise Exception("Error: Could not connect to lookup database.")
    refcur = refconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)

##  Connect to output db
    outconn = sqlite3.connect("exif_headers_votes.db")
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_EXIF_HEADERS_VOTES)

##  Connect to anno db if available
    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.anno)

    for binary_line in args.exif_file:
        binary_line_parts = binary_line.split(b"\t")
        if len(binary_line_parts) < 3:
            #We don't even have exif data. Skip.
            continue
        recdict = dict()
        recdict["forensic_path"] = str(binary_line_parts[0], "ascii")
        exif_data = binary_line_parts[2]
        match_exif_gps_lat = relat.search(exif_data)
        match_exif_gps_lon = relong.search(exif_data)
        #The above matches are essential
        if None in [match_exif_gps_lat, match_exif_gps_lon]:
            continue

        exif_gps_lat_decimal = dms_to_decimal(match_exif_gps_lat.group("GPSLatitude"))
        exif_gps_lon_decimal = dms_to_decimal(match_exif_gps_lon.group("GPSLongitude"))
        try:
            if not None in [exif_gps_lat_decimal, exif_gps_lon_decimal]:
                recdict["exif_gps_lat"] = round(exif_gps_lat_decimal, 4)
                recdict["exif_gps_lon"] = round(exif_gps_lon_decimal, 4)
        except IndexError:
            #Didn't find lat or long content. Warn and continue.
            sys.stderr.write("Warning: Couldn't find a lat (maybe long) from these matches:\n\t%r\n\t%r\n" % (match_exif_gps_lat.group(0), match_exif_gps_lon.group(0)))

        #This script's only purpose is finding lat/longs
        if None in [recdict.get("exif_gps_lat"), recdict.get("exif_gps_lon")]:
            continue

        #Lat/long references, we can guess: Default to N,E.
        match_exif_gps_latref = relatref.search(exif_data)
        match_exif_gps_longref = relongref.search(exif_data)
        exif_gps_latref = b"N"
        if match_exif_gps_latref:
            exif_gps_latref = match_exif_gps_latref.group("GPSLatitudeRef")
        exif_gps_longref = b"E"
        if match_exif_gps_longref:
            exif_gps_longref = match_exif_gps_longref.group("GPSLongitudeRef")

        if exif_gps_latref == b"S":
            recdict["exif_gps_lat"] *= -1
        if exif_gps_longref == b"W":
            recdict["exif_gps_lon"] *= -1

        #Times, we can guess from the file if we really need to.
        match_exif_timestamp = retimestamp.search(exif_data)
        if match_exif_timestamp:
            recdict["exif_gps_timestamp"] = hms_fraction_to_decimal(match_exif_timestamp.group("GPSTimeStamp"))
        match_exif_datestamp = redatestamp.search(exif_data)
        if match_exif_datestamp:
            recdict["exif_gps_datestamp"] = match_exif_datestamp.group("GPSDateStamp")
        match_exif_datetime = redatetime.search(exif_data)
        if match_exif_datetime:
            recdict["exif_datetime"] = match_exif_datetime.group("DateTime")
        #TODO integrate times into output

        refrecs = geoproc_library.latlongs_to_networked_locations(refcur, recdict["exif_gps_lat"], recdict["exif_gps_lon"], 30)
        if refrecs is None:
            recdict["database_queried"] = False
        else:
            recdict["database_queried"] = True
            #Get the nearest city within 30 miles
            if len(refrecs) > 0 and refrecs[0]["distance_miles"] < 30:
                refrec = refrecs[0]
                recdict["country"] = refrec["country"]
                recdict["region"] = refrec["region"]
                recdict["city"] = refrec["city"]
                recdict["postalCode"] = refrec["postalCode"]
                recdict["distance_miles"] = refrec["distance_miles"]

        #Note the name of the file containing this EXIF data, if available
        annorecs = geoproc_library.forensic_path_to_anno_recs(annocur, recdict["forensic_path"])

        if annorecs and len(annorecs) > 0:
            for annorec in annorecs:
                outdict = copy.deepcopy(recdict)
                outdict["fs_obj_id"] = annorec.get("fs_obj_id")
                outdict["obj_id"] = annorec.get("obj_id")
                outdict["fiwalk_id"] = annorec.get("fiwalk_id")

                #Look at file system path and say if we think it's in a cache
                if outdict.get("obj_id"):
                    annocur.execute("""
                      SELECT
                        full_path
                      FROM
                        tsk_file_full_paths
                      WHERE
                        obj_id = ?;
                    """, (outdict["obj_id"],))
                    pathrows = [row for row in annocur]
                    if len(pathrows) == 1:
                        outdict["file_in_web_cache"] = geoproc_library.path_in_web_cache(pathrows[0]["full_path"])

                #Output
                geoproc_library.insert_db(outcur, "exif_headers_votes", outdict)
        else:
            #Output to database without owning-file annotations
            geoproc_library.insert_db(outcur, "exif_headers_votes", recdict)
    outconn.commit()
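
dms_to_decimal and hms_fraction_to_decimal are defined elsewhere in this script; the regression asserts near the top of main() pin down their expected behavior. A minimal sketch consistent with those asserts (an illustration under that assumption, not the project's actual code):

from fractions import Fraction

def dms_to_decimal(dms_bytes):
    #Convert an EXIF rational DMS byte string such as b"33/1 49/1 42/1" to decimal degrees.
    #A missing seconds field is treated as zero; degenerate zero-denominator values
    #such as b"0/0 0/0 0/0" yield None.
    try:
        values = [Fraction(part.decode("ascii")) for part in dms_bytes.split()]
    except ZeroDivisionError:
        return None
    while len(values) < 3:
        values.append(Fraction(0))
    degrees, minutes, seconds = values[:3]
    return float(degrees + minutes / 60 + seconds / 3600)

def hms_fraction_to_decimal(hms_bytes):
    #Format an EXIF rational GPS time such as b"8/1 49/1 18/1" as "8:49:18".
    hours, minutes, seconds = [Fraction(part.decode("ascii")) for part in hms_bytes.split()]
    return "%d:%d:%d" % (int(hours), int(minutes), int(seconds))
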
Code example #4
File: analyze_ipv4s.py Project: ajnelson/geoproc
def main():
    global args
    #Connect to anno db if available
    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.anno)

    #Connect to db
    cfg = geoproc_cfg.config
    refconn = mysql.connector.Connect(
      host=cfg.get("mysql", "maxmind_server"),
      user=cfg.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=cfg.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    if refconn is None:
        raise Exception("Error: Could not connect to lookup database.")
    refcur = refconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)

    outconn = sqlite3.connect("ipv4s_votes.db")
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()

    outcur.execute(SQL_CREATE_IPV4S_VOTES)

    pairing_dict = collections.defaultdict(list)
    ip_set = set([])
    for (ipno, (forensic_path, ipv4, ipv4_notes)) in enumerate(geoproc_library.bulk_extractor_ips(args.be_dir)):
        pairing_dict[forensic_path].append((ipv4, ipv4_notes))
        ip_set.add(ipv4)

    #Unfortunately, there isn't much to do for timestamps without file system or network time information. #TODO Add time interface
    dummy_dftime = dfxml.dftime("2009-05-01T00:00:00Z")

    ips_to_locs = geoproc_library.ips_to_locations(refcur, None, ip_set)

    for forensic_path in pairing_dict:
        #Determine if we have a pair
        entries_at_path = pairing_dict[forensic_path]
        pair_found = len(entries_at_path) == 2
        for (ipv4, ipv4_notes) in entries_at_path:
            outdict = dict()
            outdict["believed_timestamp"] = dummy_dftime.iso8601()
            outdict["forensic_path"] = forensic_path
            outdict["ipv4"] = ipv4
            outdict["ipv4_notes"] = ipv4_notes
            if "cksum-bad" in ipv4_notes:
                outdict["cksum_ok"] = False
            elif "cksum-ok" in ipv4_notes:
                outdict["cksum_ok"] = True
            #None, otherwise
            outdict["is_socket_address"] = "sockaddr" in ipv4_notes
            outdict["pair_found"] = pair_found
            if "(src)" in ipv4_notes:
                outdict["src_or_dst"] = "src"
            elif "dst" in ipv4_notes:
                outdict["src_or_dst"] = "dst"
            #None, otherwise 
            annorecs = geoproc_library.forensic_path_to_anno_recs(annocur, outdict["forensic_path"])
            if annorecs and len(annorecs) > 1:
                sys.stderr.write("Warning: Multiple files found to own forensic path %r. Only using first.  This may cause strange results.\n" % outdict["forensic_path"])
            if annorecs and len(annorecs) > 0:
                annorec = annorecs[0]
                outdict["obj_id"] = annorec.get("obj_id")
                outdict["fs_obj_id"] = annorec.get("fs_obj_id")
                outdict["fiwalk_id"] = annorec.get("fiwalk_id")

            if ipv4 in ips_to_locs:
                for key in [
                  "maxmind_ipv4_time",
                  "country",
                  "region",
                  "city",
                  "postalCode",
                  "latitude",
                  "longitude"
                ]:
                    outdict[key] = ips_to_locs[ipv4][key]

            geoproc_library.insert_db(outcur, "ipv4s_votes", outdict)
    outconn.commit()
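
geoproc_library.bulk_extractor_ips is not shown here. Judging from how its results are consumed above (a forensic path, a dotted-quad address, and a notes string checked for substrings such as "cksum-ok" and "sockaddr"), a hedged sketch of a reader for bulk_extractor's tab-delimited ip.txt feature file might be:

import os

def bulk_extractor_ips(be_dir):
    #Sketch under assumptions: feature files are tab-delimited
    #(forensic path, feature, context/notes) and comment lines start with '#'.
    ip_feature_path = os.path.join(be_dir, "ip.txt")
    with open(ip_feature_path, "r", encoding="utf-8", errors="replace") as feature_file:
        for line in feature_file:
            if line.startswith("#"):
                continue
            parts = line.rstrip("\n").split("\t")
            if len(parts) < 3:
                continue
            yield (parts[0], parts[1], parts[2])
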
Code example #5
File: aggregate.py Project: ajnelson/geoproc
def main():
    global args

    if os.path.isfile(args.output_db):
        if args.zap:
            os.remove(args.output_db)
        else:
            raise Exception("Output database already exists; aborting.")

    #Connect to location database to get names (necessary for wildcard matching, like if we just found a city)
    config = geoproc_cfg.config
    lookupconn = mdb.Connect(
      host=config.get("mysql", "maxmind_server"),
      user=config.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=config.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)

    #Maybe connect to ground truth
    gtconn = None
    gtcur = None
    if args.ground_truth:
        gtconn = sqlite3.connect(args.ground_truth)
        gtconn.row_factory = sqlite3.Row
        #Don't lock database
        gtcur = gtconn.cursor()

    results_dir_list = geoproc_library.get_results_dirs(args.input_root)
    dprint("Aggregating %d directories." % len(results_dir_list))

    #Connect to output database
    outconn = sqlite3.connect(args.output_db)
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()

    def add_columns(outcur, table_name):
        #Simple aggregate table: Just gets column for image_id
        outcur.execute("ALTER TABLE %s ADD COLUMN image_id TEXT;" % table_name)

        #Weighted aggregate table: Gets other columns to determine vote accuracy
        outcur.execute("CREATE TABLE %s_weighted AS SELECT * FROM %s;" % (table_name, table_name))
        outcur.execute("ALTER TABLE %s_weighted ADD COLUMN number_possible_locations NUMBER" % table_name)
        for bcol in ["country", "region", "city", "location"]:
            outcur.execute("ALTER TABLE %s_weighted ADD COLUMN correct_%s NUMBER;" % (table_name, bcol))

    outcur.execute(analyze_cookie_files.SQL_CREATE_COOKIE_FILES_VOTES)
    add_columns(outcur, "cookie_files_votes")
    outcur.execute(analyze_email_files.SQL_CREATE_EMAIL_FILES_VOTES)
    add_columns(outcur, "email_files_votes")
    outcur.execute(analyze_exif_headers.SQL_CREATE_EXIF_HEADERS_VOTES)
    add_columns(outcur, "exif_headers_votes")
    outcur.execute(analyze_ipv4s.SQL_CREATE_IPV4S_VOTES)
    add_columns(outcur, "ipv4s_votes")

    for results_dir in results_dir_list:
        try:
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_cookie_files.sh", "cookie_files_votes.db", "cookie_files_votes")
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_email_files.sh", "email_files_votes.db", "email_files_votes")
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_exif_headers.sh", "exif_headers_votes.db", "exif_headers_votes")
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_ipv4s.sh", "ipv4s_votes.db", "ipv4s_votes")
        except:
            dprint("Debug: Error occurred on results_dir %r." % results_dir)
            raise
    outconn.commit()
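
dprint is used throughout these examples but defined elsewhere. The --debug help text in example #3 ("writes to stderr") suggests behavior along the lines of this sketch:

import sys

def dprint(message):
    #Sketch: emit debug messages to stderr only when --debug was requested.
    if args.debug:
        sys.stderr.write(message + "\n")
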
Code example #6
def main():
    global args

    #Set up lookup database connection
    cfg = geoproc_cfg.config
    lookupconn = None
    lookupcur = None
    try:
        import mysql.connector as mdb
        lookupconn = mdb.connect(
          host=cfg.get("mysql", "maxmind_server"),
          user=cfg.get("mysql", "maxmind_read_username"),
          password=geoproc_cfg.db_password("maxmind_read_password_file"),
          db=cfg.get("mysql", "maxmind_schema"),
          use_unicode=True
        )
        lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    except Exception:
        sys.stderr.write("Warning: Could not connect to database. Proceeding without database support.\n")

    #Connect to annodb
    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.annodb)

    #Verify input
    manifest_path = os.path.join(args.emaildir, "manifest.txt")
    if not os.path.isfile(manifest_path):
        raise Exception("Error: manifest.txt not found in input directory.")

    #Ingest BE ips, if available
    #Stash in (once-tested) histogram.
    #Dictionary key: ipv4 address
    #Dictionary value: (notes, tally) default dictionary.
    ip_notes_histogram = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
    if args.bulk_extractor_output:
        for (forensic_path, ipv4, ipv4_notes) in geoproc_library.bulk_extractor_ips(args.bulk_extractor_output):
            ip_notes_histogram[ipv4][ipv4_notes] += 1
    dprint("Debug: Number of IPv4s with notes: %d." % len(ip_notes_histogram.keys()))

    #Set up output database
    outdbpath = os.path.join(args.outdir, "email_files_votes.db")
    if os.path.isfile(outdbpath):
        raise Exception("Error: Output database already exists. This script won't overwrite. Aborting.")
    outconn = sqlite3.connect(outdbpath)
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_EMAIL_FILES_VOTES)

    for (fiwalk_id, messageno, message) in emails_in_dir_manifest(manifest_path):
        dprint("Debug: Analyzing a record from fiwalk_id %r." % fiwalk_id)
        #print(repr(type(message)))
        #for i in message.keys():
        #    print('%r: %r' % (i, message.get_all(i)))
        received_recs = message.get_all("Received")
        if not received_recs:
            continue
        pathlength = len(received_recs)
        for (pathindex, pathline) in enumerate(received_recs):
            #TODO Just getting all the IPs for now; filter later
            ips = geoproc_library.all_ipv4s(pathline)
            dprint("Debug: Found this many IP's: %d.\n\t%r" % (len(ips), ips))
            
            #Can we get a date?
            maybe_timestamp = None
            maybe_timestamp_match = dfxml.rx_rfc822datetime.search(pathline)
            if maybe_timestamp_match:
                thedatestring = maybe_timestamp_match.group(0)
                try:
                    maybe_timestamp = dfxml.dftime(thedatestring)
                except Exception:
                    sys.stderr.write("Warning: An error occurred trying to parse time input.\nInput:%r\nStack trace:\n" % thedatestring)
                    sys.stderr.write(traceback.format_exc())
                    sys.stderr.write("\n")
                    #Don't stop here.
            dprint("Debug: Believed timestamp: %r." % maybe_timestamp)
            
            #Now that we have a date, can we get locations?
            if maybe_timestamp:

                #Can we get a single recipient?  (This is, of course, not guaranteed to be the owner.)
                sole_recipient = None
                delivered_to_headers = message.get_all("Delivered-To")
                to_headers = message.get_all("To")
                if delivered_to_headers and len(delivered_to_headers) == 1:
                    sole_recipient = delivered_to_headers[0]
                elif to_headers and len(to_headers) == 1 and len(to_headers[0].split("\n")) == 1:
                    sole_recipient = to_headers[0]
                all_ip_locations = geoproc_library.ips_to_locations(lookupcur, maybe_timestamp.datetime(), ips)
                dprint("Debug: Fetched these IP location records:\n\t%r" % all_ip_locations)
                for ip in ips:
                    outdict = {"fiwalk_id":fiwalk_id}
                    #TODO Use annodb to get TSK identifiers
                    outdict["message_index"] = messageno
                    outdict["ipv4"] = ip
                    outdict["received_path_index"] = pathindex
                    outdict["received_path_length"] = pathlength
                    outdict["received_header_text"] = pathline
                    outdict["database_queried"] = all_ip_locations is not None
                    outdict["believed_timestamp"] = str(maybe_timestamp)
                    outdict["sole_recipient_domain_is_webmail"] = geoproc_library.in_webmail_domain(sole_recipient)
                    if all_ip_locations is not None and ip in all_ip_locations:
                        rec = all_ip_locations[ip]
                        outdict["latitude"] = rec.get("latitude")
                        outdict["longitude"] = rec.get("longitude")
                        outdict["postalCode"] = rec.get("postalCode")
                        outdict["maxmind_ipv4_time"] = dfxml.dftime(rec.get("maxmind_ipv4_time")).iso8601()
                        if rec.get("country"):
                            outdict["country"] = rec["country"]
                        if rec.get("region"):
                            outdict["region"] = rec["region"]
                        if rec.get("city"):
                            outdict["city"] = rec["city"]
                        dprint("Debug: Checking for IP notes for %r." % ip)
                        if ip in ip_notes_histogram:
                            dprint("Debug: Formatting notes for %r." % ip)
                            notedict = ip_notes_histogram[ip]
                            notelist = sorted(notedict.keys())
                            notes_to_format = []
                            for note in notelist:
                                notes_to_format.append("%d %r" % (notedict[note], note))
                            outdict["ipv4_be_notes"] = "; ".join(notes_to_format)
                            outdict["ipv4_be_has_cksum_or_socket"] = "sockaddr" in outdict["ipv4_be_notes"] or "cksum-ok" in outdict["ipv4_be_notes"]
                        dprint("Debug: Outdict just before inserting:\n\t%r" % outdict)
                    geoproc_library.insert_db(outcur, "email_files_votes", outdict)
    outconn.commit()
    dprint("Debug: Done.")
Code example #7
def main():
##  The program outline is embedded inline with double-hash comments (fit for grepping).

##  Parse parameters
    parser = argparse.ArgumentParser(description="Create and populate MySQL database with MaxMind data.")
    parser.add_argument("-r", "--regress", action="store_true", dest="regress", help="Run unit tests and exit.  Requires database connection.")
    parser.add_argument("-d", "--debug", action="store_true", dest="debug", help="Print debug information to stderr.")
    parser.add_argument("zips", help="Text file of MaxMind zips to ingest, one absolute path per line; or a directory containing the zips; or just one zip.")
    args = parser.parse_args()
    
    input_file_list = []
    if args.zips.endswith(".zip"):
        input_file_list.append(os.path.abspath(os.path.expanduser(args.zips)))
    elif os.path.isdir(args.zips):
        for zipdirentry in os.listdir(args.zips):
            if zipdirentry.endswith(".zip"):
                input_file_list.append(os.path.abspath(os.path.join(args.zips, zipdirentry)))
    else:
        with open(args.zips, "r") as inputFile:
            for line in inputFile:
                maxmind_zip_path = line.strip()
                if not os.path.isabs(maxmind_zip_path):
                    raise ValueError("Paths in ziplist file must be absolute.")
                input_file_list.append(maxmind_zip_path)
            
##  Connect to db
    #Not using a try block - the whole point of this script is filling the database
    config = geoproc_cfg.config
    con = mdb.Connect(
      host=config.get("mysql", "maxmind_server"),
      user=config.get("mysql", "maxmind_write_username"),
      password=geoproc_cfg.db_password("maxmind_write_password_file"),
      db=config.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    cur = con.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    cur.execute("SET NAMES 'utf8';")

##  Run regress tests?
        #TODO
        
##  Create tables if non-existent
    if args.debug:
        sys.stderr.write("Creating tables...\n")
    #Datetime vs timestamp: <http://stackoverflow.com/a/409305/1207160>
    cur.execute(SQL_CREATE_ingest_audit_zips)
    cur.execute(SQL_CREATE_ingest_audit_locations)
    cur.execute(SQL_CREATE_ingest_audit_blocks)
    cur.execute(SQL_CREATE_LocationTable)
    cur.execute(SQL_CREATE_BlockTable)
    cur.execute(SQL_CREATE_LocationLatLongs)
            
##  Record the number of location tables processed - non-zero triggers a table refresh later
    tally_location_updates = 0
    

##  Read table of processed zip files
##      # Should contain: Zip file base name, full path to zip file, datetime started, datetime ended
##  Read table of processed block files
##      # Should contain: Zip file base name (foreign key), mtime, datetime started, datetime ended
##  Read table of processed location files
##      # Should contain: Zip file base name (foreign key), mtime, datetime started, datetime ended
##  Read input list of zip files to ingest
    for maxmind_zip_path in input_file_list:
        maxmind_zip_basename = os.path.basename(maxmind_zip_path)
##  For each zip in the list:
##      If the zip had not finished processing previously:
        previously_finished_zip = False
        reset_this_zip = False
        cur.execute("SELECT * FROM ingest_audit_zips WHERE zip_basename = %s;", (maxmind_zip_basename,))
        for row in cur.fetchall():
            if row["datetime_end"] is None:
                #This row shouldn't be here at all.
                reset_this_zip = True
            else:
                previously_finished_zip = True
        if previously_finished_zip:
            sys.stderr.write("Warning: Zip file already read into database... Skipping zip file:" + maxmind_zip_path + "\n")
            continue

##          If the block file had started and not finished previously:
        cur.execute("SELECT * FROM ingest_audit_blocks WHERE zip_basename = %s AND datetime_start IS NOT NULL AND datetime_end IS NULL;", (maxmind_zip_basename,))
        rows = [row for row in cur.fetchall()]
        for row in rows:
            sys.stderr.write("Note: Found incomplete Block ingest for zip %r, started %r. Deleting records.\n" % (row["zip_basename"], row["datetime_start"]))
##              Delete block records with matching mtime
            cur.execute("DELETE FROM BlockTable WHERE lastModifiedTime = %s;", (row["mtime"],))
        if len(rows) > 0:
            cur.execute("DELETE FROM ingest_audit_blocks WHERE zip_basename = %s AND datetime_end IS NULL;", (maxmind_zip_basename,))

##          If the location file had started and not finished previously:
        cur.execute("SELECT * FROM ingest_audit_locations WHERE zip_basename = %s AND datetime_start IS NOT NULL AND datetime_end IS NULL;", (maxmind_zip_basename,))
##              Delete block records with matching mtime
        rows = [row for row in cur.fetchall()]
        for row in rows:
            sys.stderr.write("Note: Found incomplete Location ingest for zip %r, started %r. Deleting records.\n" % (row["zip_basename"], row["datetime_start"]))
            cur.execute("DELETE FROM LocationLatLongs WHERE lastModifiedTime = %s;", (row["mtime"],))
            cur.execute("DELETE FROM LocationTable WHERE lastModifiedTime = %s;", (row["mtime"],))
        if len(rows) > 0:
            cur.execute("DELETE FROM ingest_audit_locations WHERE zip_basename = %s AND datetime_end IS NULL;", (maxmind_zip_basename,))

##          Mark start time for processing zip
        starttime_zip = datetime.datetime.now()
        if reset_this_zip:
            cur.execute("DELETE FROM ingest_audit_zips WHERE zip_basename = %s;", (maxmind_zip_basename,))
        #strftime appears to be the way to insert datetimes from Python to MySQL <http://stackoverflow.com/a/4508923/1207160>
        cur.execute("INSERT INTO ingest_audit_zips(zip_basename, zip_full_path, datetime_start) VALUES (%s,%s,%s);", (maxmind_zip_basename, maxmind_zip_path, datetime.datetime.strftime(starttime_zip, "%Y-%m-%d %H:%M:%S")))
        con.commit()

##          Get zip internal listing
        with zipfile.ZipFile(maxmind_zip_path, "r") as this_zip:
            zipped_blocks_name = None
            zipped_locations_name = None
            for zi in this_zip.infolist():
                if zi.filename.endswith("Location.csv"):
                    zipped_locations_name = zi.filename
                    zipped_locations_time = datetime_from_ziptime(zi.date_time)
                elif zi.filename.endswith("Blocks.csv"):
                    zipped_blocks_name = zi.filename
                    zipped_blocks_time = datetime_from_ziptime(zi.date_time)

##          If the location file had not finished processing previously:
            should_process_locations = False
            if zipped_locations_name is not None:
                zipped_locations_time_str = datetime.datetime.strftime(zipped_locations_time, "%Y-%m-%d %H:%M:%S")
                cur.execute("SELECT COUNT(*) AS tally FROM ingest_audit_locations WHERE zip_basename = %s AND datetime_end IS NOT NULL;", (maxmind_zip_basename,))
                for rec in cur.fetchall():
                    should_process_locations = (rec["tally"] == 0)
            if should_process_locations:        
##              Mark start time
                starttime_locations = datetime.datetime.now()
                cur.execute("INSERT INTO ingest_audit_locations(zip_basename, mtime, datetime_start) VALUES (%s,%s,%s);", (maxmind_zip_basename, zipped_locations_time_str, datetime.datetime.strftime(starttime_locations, "%Y-%m-%d %H:%M:%S")))
                con.commit()
##              Ingest
                with this_zip.open(zipped_locations_name, "r") as locations_file:
                    if args.debug:
                        sys.stderr.write("Note: Ingesting %s / %s...\n" % (maxmind_zip_path, zipped_locations_name))
                    lineCount = 0
                    locations_file_wrapped = io.TextIOWrapper(locations_file, encoding="iso-8859-1") #MaxMind content encoding: http://dev.maxmind.com/geoip/csv
                    locations_reader = csv.reader(locations_file_wrapped, delimiter=',', quoting=csv.QUOTE_ALL)
                    for row in locations_reader:
                        if len(row) == 9  and row[0].isdigit():
                            cur.execute("INSERT INTO LocationTable VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (int(row[0]),zipped_locations_time_str,row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8] ))
                        lineCount = lineCount + 1
                        if lineCount % 10000 == 0 :
                            if args.debug:
                                sys.stderr.write("\tOn locations CSV record %d...\n" % lineCount)
                            con.commit()
                    con.commit()
##              Update lat/long index
                if args.debug:
                    sys.stderr.write("Note: Updating lat/long index.\n")
                cur.execute("""
                  INSERT INTO LocationLatLongs (latlong, locId, lastModifiedTime)
                    SELECT
                      POINT(latitude, longitude),
                      locId,
                      lastModifiedTime
                    FROM
                      LocationTable
                    WHERE
                      lastModifiedTime = %s
                    ;
                """, (zipped_locations_time_str,))
                con.commit()
##              Mark end time
                endtime_locations = datetime.datetime.now()
                cur.execute("UPDATE ingest_audit_locations SET datetime_end = %s WHERE zip_basename = %s;", (datetime.datetime.strftime(endtime_locations, "%Y-%m-%d %H:%M:%S"), maxmind_zip_basename))
                con.commit()
                tally_location_updates += 1

##          If the block file had not finished processing previously:
            should_process_blocks = False
            if zipped_blocks_name is not None:
                zipped_blocks_time_str = datetime.datetime.strftime(zipped_blocks_time, "%Y-%m-%d %H:%M:%S")
                cur.execute("SELECT COUNT(*) AS tally FROM ingest_audit_blocks WHERE zip_basename = %s AND datetime_end IS NOT NULL;", (maxmind_zip_basename,))
                for rec in cur.fetchall():
                    should_process_blocks = (rec["tally"] == 0)
            if should_process_blocks:        
##              Mark start time
                starttime_blocks = datetime.datetime.now()
                cur.execute("INSERT INTO ingest_audit_blocks(zip_basename, mtime, datetime_start) VALUES (%s,%s,%s);", (maxmind_zip_basename, zipped_blocks_time_str, datetime.datetime.strftime(starttime_blocks, "%Y-%m-%d %H:%M:%S")))
                con.commit()
##              Ingest
                with this_zip.open(zipped_blocks_name, "r") as blocks_file:
                    if args.debug:
                        sys.stderr.write("Note: Ingesting %s / %s...\n" % (maxmind_zip_path, zipped_blocks_name))
                    lineCount = 0
                    blocks_file_wrapped = io.TextIOWrapper(blocks_file, encoding="iso-8859-1") #MaxMind content encoding: http://dev.maxmind.com/geoip/csv
                    blocks_reader = csv.reader(blocks_file_wrapped, delimiter=',', quoting=csv.QUOTE_ALL)
                    for row in blocks_reader:
                        if len(row) == 3  and row[0].isdigit():
                            cur.execute("INSERT INTO BlockTable VALUES(%s,%s,%s,%s)", (row[0],row[1],row[2],zipped_blocks_time_str))
                        lineCount = lineCount + 1
                        if lineCount % 10000 == 0 :
                            if args.debug:
                                sys.stderr.write("\tOn blocks CSV record %d...\n" % lineCount)
                            con.commit()
                    con.commit()
##              Mark end time
                endtime_blocks = datetime.datetime.now()
                cur.execute("UPDATE ingest_audit_blocks SET datetime_end = %s WHERE zip_basename = %s;", (datetime.datetime.strftime(endtime_blocks, "%Y-%m-%d %H:%M:%S"), maxmind_zip_basename))
                con.commit()
##          Mark end time
        endtime_zip = datetime.datetime.now()
        cur.execute("UPDATE ingest_audit_zips SET datetime_end = %s WHERE zip_basename = %s;", (datetime.datetime.strftime(endtime_zip, "%Y-%m-%d %H:%M:%S"), maxmind_zip_basename))
        con.commit()
    #If we updated the LocationTable, refresh the list of all distinct location triples
    if tally_location_updates > 0:
        if args.debug:
            sys.stderr.write("Note: Refreshing DistinctLocations table.\n")
        cur.execute(SQL_DROP_DistinctLocations)
        cur.execute(SQL_CREATE_DistinctLocations)
        for locfield in ["country", "region", "city"]:
            formatdict = {"locfield":locfield}
            cur.execute("DROP TABLE IF EXISTS Distinct_%(locfield)s;" % formatdict)
            con.commit()
            cur.execute("""
              CREATE TABLE Distinct_%(locfield)s AS
              SELECT DISTINCT
                %(locfield)s
              FROM
                DistinctLocations
              WHERE
                %(locfield)s IS NOT NULL AND
                %(locfield)s <> ""
              ORDER BY
                %(locfield)s
              ;
            """ % formatdict)
        con.commit()
##  Done.

    if con:
        con.close()
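
datetime_from_ziptime is referenced above but not included in the excerpt. Since zipfile.ZipInfo.date_time is a (year, month, day, hour, minute, second) tuple, a minimal sketch is:

import datetime

def datetime_from_ziptime(ziptime):
    #ZipInfo.date_time is already ordered the way the datetime constructor expects.
    return datetime.datetime(*ziptime)
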