def main():
    cfg = geoproc_cfg.config
    lookupconn = None
    lookupcur = None
    try:
        import mysql.connector as mdb
        lookupconn = mdb.connect(
          host=cfg.get("mysql", "maxmind_server"),
          user=cfg.get("mysql", "maxmind_read_username"),
          password=geoproc_cfg.db_password("maxmind_read_password_file"),
          db=cfg.get("mysql", "maxmind_schema"),
          use_unicode=True
        )
        lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    except Exception:
        sys.stderr.write("Warning: Could not connect to database. Proceeding without database support.\n")

    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.anno)

    outconn = sqlite3.connect(os.path.join(args.out_dir, "cookie_files_votes.db"))
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_COOKIE_FILES_VOTES)

    #Walk cookie dump directory
    #TODO? Use a validly-extracted manifest file instead of walking the dump directory.
    for (dirpath, dirnames, filenames) in os.walk(args.cookie_dump_dir):
        for cookie_txt_fname in (x for x in filenames if x.endswith("txt")):
            cookie_fiwalk_id = int(os.path.splitext(cookie_txt_fname)[0])
            dprint("Reading cookie_fiwalk_id: %r." % cookie_fiwalk_id)
            with open(os.path.join(dirpath, cookie_txt_fname), "r", encoding="utf-8") as cookie_file:
                try:
                    some_kibs = cookie_file.read(0x8000)
                except Exception:
                    sys.stderr.write("Warning: Reading file %r failed. Stack trace follows.\n" % cookie_txt_fname)
                    sys.stderr.write(traceback.format_exc())
                    continue
                if len(some_kibs) == 0x8000:
                    sys.stderr.write("Warning: Skipped abnormally large 'cookie' file, >=32KiB: %r.\n" % cookie_txt_fname)
                    continue
                votes = get_cookie_votes(outconn, lookupcur, annocur, cookie_fiwalk_id, some_kibs)
                for vote in votes:
                    geoproc_library.insert_db(outcur, "cookie_files_votes", vote)
    outconn.commit()
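
#The vote insertion above relies on geoproc_library.insert_db, which is
#defined elsewhere in the repository.  A minimal sketch of what a compatible
#helper could look like for the sqlite3 cursors used here (an illustrative
#assumption, not the library's actual implementation): build a parameterized
#INSERT from the dict's keys so each vote dict maps to one table row.
def insert_db_sketch(cursor, table_name, update_dict):
    #Sort keys for a stable column order
    keys = sorted(update_dict.keys())
    sql = "INSERT INTO %s (%s) VALUES (%s);" % (
      table_name,
      ", ".join(keys),
      ", ".join("?" for key in keys)
    )
    cursor.execute(sql, tuple(update_dict[key] for key in keys))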
def main():
    import mysql.connector as mdb
    lookupconn = mdb.connect(
      host=geoproc_cfg.config.get("mysql", "maxmind_server"),
      user=geoproc_cfg.config.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=geoproc_cfg.config.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    for locfield in ["country", "region", "city"]:
        with open("distinct_%s.txt" % locfield, "w") as outfile:
            lookupcur.execute("""
              SELECT
                %(locfield)s
              FROM
                Distinct_%(locfield)s
              ;
            """ % {"locfield": locfield})
            for row in lookupcur:
                outfile.write(row[locfield] + "\n")
def main():
    global args
    parser = argparse.ArgumentParser(description="Analyze Bulk Extractor EXIF output for location indicators.")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug printing (writes to stderr).")
    parser.add_argument("-r", "--regress", action="store_true", help="Run regression tests and exit.")
    args_regress = parser.parse_known_args()[0]

    ## Set up regular expressions for extracting desired EXIF tags
    relatref = re.compile(br"<Exif.GPSInfo.GPSLatitudeRef>(?P<GPSLatitudeRef>[NS])</Exif.GPSInfo.GPSLatitudeRef>")
    relongref = re.compile(br"<Exif.GPSInfo.GPSLongitudeRef>(?P<GPSLongitudeRef>[EW])</Exif.GPSInfo.GPSLongitudeRef>")
    relat = re.compile(br"<Exif.GPSInfo.GPSLatitude>(?P<GPSLatitude>[0-9\-/ ]{1,40})</Exif.GPSInfo.GPSLatitude>")
    relong = re.compile(br"<Exif.GPSInfo.GPSLongitude>(?P<GPSLongitude>[0-9\-/ ]{1,40})</Exif.GPSInfo.GPSLongitude>")
    retimestamp = re.compile(br"<Exif.GPSInfo.GPSTimeStamp>(?P<GPSTimeStamp>[0-9/ ]{1,40})</Exif.GPSInfo.GPSTimeStamp>")
    redatestamp = re.compile(br"<Exif.GPSInfo.GPSDateStamp>(?P<GPSDateStamp>[0-9: .]{1,40})</Exif.GPSInfo.GPSDateStamp>")
    redatetime = re.compile(br"<Exif.Image.DateTime>(?P<DateTime>[0-9: .]{1,40})</Exif.Image.DateTime>")

    if args_regress.regress:
        assert round(dms_to_decimal(b"33/1 49/1 42/1"), 4) == 33.8283
        assert round(dms_to_decimal(b"33/1 49/1 0/1"), 4) == round(dms_to_decimal(b"33/1 49/1"), 4)
        assert dms_to_decimal(b"0/0 0/0 0/0") is None #This value was observed in the Real Data Corpus.
        #TODO assert hms_fraction_to_decimal(b"8/1 49/1 18/1") == "8:49:18"
        #Sample EXIF data supplied by m57-redacted-terry-2009-12-07.aff, image offset 2116743168, pretty-printed with xmllint
        test_exif = b"""<?xml version="1.0"?>
<exif>
  <width>48</width>
  <height>48</height>
  <Exif.Image.Make>Apple</Exif.Image.Make>
  <Exif.Image.Model>iPhone</Exif.Image.Model>
  <Exif.Image.XResolution>72/1</Exif.Image.XResolution>
  <Exif.Image.YResolution>72/1</Exif.Image.YResolution>
  <Exif.Image.ResolutionUnit>2</Exif.Image.ResolutionUnit>
  <Exif.Image.DateTime>2008:11:26 11:46:56</Exif.Image.DateTime>
  <Exif.Image.ExifTag>180</Exif.Image.ExifTag>
  <Exif.Photo.FNumber>14/5</Exif.Photo.FNumber>
  <Exif.Photo.DateTimeOriginal>2008:11:26 11:46:56</Exif.Photo.DateTimeOriginal>
  <Exif.Photo.DateTimeDigitized>2008:11:26 11:46:56</Exif.Photo.DateTimeDigitized>
  <Exif.Photo.ColorSpace>1</Exif.Photo.ColorSpace>
  <Exif.Photo.PixelXDimension>1200</Exif.Photo.PixelXDimension>
  <Exif.Photo.PixelYDimension>1600</Exif.Photo.PixelYDimension>
  <Exif.Image.GPSTag>306</Exif.Image.GPSTag>
  <Exif.GPSInfo.GPSLatitudeRef>N</Exif.GPSInfo.GPSLatitudeRef>
  <Exif.GPSInfo.GPSLatitude>38/1 5354/100 0/1</Exif.GPSInfo.GPSLatitude>
  <Exif.GPSInfo.GPSLongitudeRef>W</Exif.GPSInfo.GPSLongitudeRef>
  <Exif.GPSInfo.GPSLongitude>92/1 2343/100 0/1</Exif.GPSInfo.GPSLongitude>
  <Exif.GPSInfo.GPSTimeStamp>11/1 46/1 788/100</Exif.GPSInfo.GPSTimeStamp>
  <Exif.Image.0xa500>11/5</Exif.Image.0xa500>
  <Exif.Thumbnail.Compression>6</Exif.Thumbnail.Compression>
  <Exif.Thumbnail.Orientation>6</Exif.Thumbnail.Orientation>
  <Exif.Thumbnail.XResolution>72/1</Exif.Thumbnail.XResolution>
  <Exif.Thumbnail.YResolution>72/1</Exif.Thumbnail.YResolution>
  <Exif.Thumbnail.ResolutionUnit>2</Exif.Thumbnail.ResolutionUnit>
  <Exif.Thumbnail.JPEGInterchangeFormat>550</Exif.Thumbnail.JPEGInterchangeFormat>
  <Exif.Thumbnail.JPEGInterchangeFormatLength>11682</Exif.Thumbnail.JPEGInterchangeFormatLength>
</exif>"""
        assert relat.search(test_exif) is not None
        assert relong.search(test_exif) is not None
        assert relatref.search(test_exif).group("GPSLatitudeRef") == b"N"
        assert relongref.search(test_exif).group("GPSLongitudeRef") == b"W"
        sys.exit(0)

    parser.add_argument("-a", "--anno", help="Annotation database of Fiwalk and TSK-db")
    parser.add_argument("exif_file", type=argparse.FileType('rb'), help="Bulk Extractor exif.txt")
    args = parser.parse_args()

    dprint("Debug: args.anno = %r." % args.anno)

    ## Connect to db
    cfg = geoproc_cfg.config
    refconn = mysql.connector.Connect(
      host=cfg.get("mysql", "maxmind_server"),
      user=cfg.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=cfg.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    if refconn is None:
        raise Exception("Error: Could not define lookup cursor.")
    refcur = refconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)

    ## Connect to output db
    outconn = sqlite3.connect("exif_headers_votes.db")
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_EXIF_HEADERS_VOTES)

    ## Connect to anno db if available
    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.anno)

    for binary_line in args.exif_file:
        binary_line_parts = binary_line.split(b"\t")
        if len(binary_line_parts) < 3:
            #We don't even have exif data. Skip.
            continue
        recdict = dict()
        recdict["forensic_path"] = str(binary_line_parts[0], "ascii")
        exif_data = binary_line_parts[2]

        match_exif_gps_lat = relat.search(exif_data)
        match_exif_gps_lon = relong.search(exif_data)
        #The above matches are essential
        if None in [match_exif_gps_lat, match_exif_gps_lon]:
            continue
        exif_gps_lat_decimal = dms_to_decimal(match_exif_gps_lat.group("GPSLatitude"))
        exif_gps_lon_decimal = dms_to_decimal(match_exif_gps_lon.group("GPSLongitude"))
        try:
            if None not in [exif_gps_lat_decimal, exif_gps_lon_decimal]:
                recdict["exif_gps_lat"] = round(exif_gps_lat_decimal, 4)
                recdict["exif_gps_lon"] = round(exif_gps_lon_decimal, 4)
        except IndexError:
            #Didn't find lat or long content. Warn and continue.
            sys.stderr.write("Warning: Couldn't find a lat (maybe long) from these matches:\n\t%r\n\t%r\n" % (match_exif_gps_lat.group(0), match_exif_gps_lon.group(0)))
        #This script's only purpose is finding lat/longs
        if None in [recdict.get("exif_gps_lat"), recdict.get("exif_gps_lon")]:
            continue

        #Lat/long references, we can guess: Default to N,E.
        match_exif_gps_latref = relatref.search(exif_data)
        match_exif_gps_longref = relongref.search(exif_data)
        exif_gps_latref = b"N"
        if match_exif_gps_latref:
            exif_gps_latref = match_exif_gps_latref.group("GPSLatitudeRef")
        exif_gps_longref = b"E"
        if match_exif_gps_longref:
            exif_gps_longref = match_exif_gps_longref.group("GPSLongitudeRef")
        if exif_gps_latref == b"S":
            recdict["exif_gps_lat"] *= -1
        if exif_gps_longref == b"W":
            recdict["exif_gps_lon"] *= -1

        #Times, we can guess from the file if we really need to.
        match_exif_timestamp = retimestamp.search(exif_data)
        if match_exif_timestamp:
            recdict["exif_gps_timestamp"] = hms_fraction_to_decimal(match_exif_timestamp.group("GPSTimeStamp"))
        match_exif_datestamp = redatestamp.search(exif_data)
        if match_exif_datestamp:
            recdict["exif_gps_datestamp"] = match_exif_datestamp.group("GPSDateStamp")
        match_exif_datetime = redatetime.search(exif_data)
        if match_exif_datetime:
            recdict["exif_datetime"] = match_exif_datetime.group("DateTime")
        #TODO integrate times into output

        refrecs = geoproc_library.latlongs_to_networked_locations(refcur, recdict["exif_gps_lat"], recdict["exif_gps_lon"], 30)
        if refrecs is None:
            recdict["database_queried"] = False
        else:
            recdict["database_queried"] = True
            #Get the nearest city within 30 miles
            if len(refrecs) > 0 and refrecs[0]["distance_miles"] < 30:
                refrec = refrecs[0]
                recdict["country"] = refrec["country"]
                recdict["region"] = refrec["region"]
                recdict["city"] = refrec["city"]
                recdict["postalCode"] = refrec["postalCode"]
                recdict["distance_miles"] = refrec["distance_miles"]

        #Note the name of the file containing this EXIF data, if available
        annorecs = geoproc_library.forensic_path_to_anno_recs(annocur, recdict["forensic_path"])
        if annorecs and len(annorecs) > 0:
            for annorec in annorecs:
                outdict = copy.deepcopy(recdict)
                outdict["fs_obj_id"] = annorec.get("fs_obj_id")
                outdict["obj_id"] = annorec.get("obj_id")
                outdict["fiwalk_id"] = annorec.get("fiwalk_id")
                #Look at file system path and say if we think it's in a cache
                if outdict.get("obj_id"):
                    annocur.execute("""
                      SELECT
                        full_path
                      FROM
                        tsk_file_full_paths
                      WHERE
                        obj_id = ?;
                    """, (outdict["obj_id"],))
                    pathrows = [row for row in annocur]
                    if len(pathrows) == 1:
                        outdict["file_in_web_cache"] = geoproc_library.path_in_web_cache(pathrows[0]["full_path"])
                #Output
                geoproc_library.insert_db(outcur, "exif_headers_votes", outdict)
        else:
            #Output to database without owning-file annotations
            geoproc_library.insert_db(outcur, "exif_headers_votes", recdict)
    outconn.commit()
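
#The --regress asserts above pin down the expected behavior of the helpers
#dms_to_decimal and hms_fraction_to_decimal, which are defined elsewhere in
#this script.  Minimal sketches consistent with those asserts follow; they
#are illustrative reconstructions, not the canonical definitions, and they
#assume well-formed space-separated rational fields.
def dms_to_decimal_sketch(dms_bytes):
    """Convert an EXIF rational DMS byte string, e.g. b"33/1 49/1 42/1", to
    decimal degrees.  Returns None on a zero denominator (a value observed
    in the Real Data Corpus)."""
    values = []
    for part in dms_bytes.split(b" "):
        (num, _, denom) = part.partition(b"/")
        if int(denom) == 0:
            return None
        values.append(int(num) / int(denom))
    #Degrees, plus minutes/60, plus optional seconds/3600
    return sum(value / (60 ** power) for (power, value) in enumerate(values))

def hms_fraction_to_decimal_sketch(hms_bytes):
    """Convert an EXIF rational H M S byte string, e.g. b"8/1 49/1 18/1",
    to an H:M:S string such as "8:49:18" (assumes exactly three fields)."""
    (h, m, s) = [int(num) / int(denom) for (num, _, denom) in
                 (part.partition(b"/") for part in hms_bytes.split(b" "))]
    return "%d:%d:%g" % (h, m, s)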
def main():
    global args

    #Connect to anno db if available
    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.anno)

    #Connect to db
    cfg = geoproc_cfg.config
    refconn = mysql.connector.Connect(
      host=cfg.get("mysql", "maxmind_server"),
      user=cfg.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=cfg.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    if refconn is None:
        raise Exception("Error: Could not define lookup cursor.")
    refcur = refconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)

    outconn = sqlite3.connect("ipv4s_votes.db")
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_IPV4S_VOTES)

    pairing_dict = collections.defaultdict(list)
    ip_set = set([])
    for (ipno, (forensic_path, ipv4, ipv4_notes)) in enumerate(geoproc_library.bulk_extractor_ips(args.be_dir)):
        pairing_dict[forensic_path].append((ipv4, ipv4_notes))
        ip_set.add(ipv4)

    #Unfortunately, there isn't much to do for timestamps without file system or network time information.
    #TODO Add time interface
    dummy_dftime = dfxml.dftime("2009-05-01T00:00:00Z")

    ips_to_locs = geoproc_library.ips_to_locations(refcur, None, ip_set)

    for forensic_path in pairing_dict:
        #Determine if we have a pair
        entries_at_path = pairing_dict[forensic_path]
        pair_found = len(entries_at_path) == 2
        for (ipv4, ipv4_notes) in entries_at_path:
            outdict = dict()
            outdict["believed_timestamp"] = dummy_dftime.iso8601()
            outdict["forensic_path"] = forensic_path
            outdict["ipv4"] = ipv4
            outdict["ipv4_notes"] = ipv4_notes
            if "cksum-bad" in ipv4_notes:
                outdict["cksum_ok"] = False
            elif "cksum-ok" in ipv4_notes:
                outdict["cksum_ok"] = True
            #None, otherwise
            outdict["is_socket_address"] = "sockaddr" in ipv4_notes
            outdict["pair_found"] = pair_found
            if "(src)" in ipv4_notes:
                outdict["src_or_dst"] = "src"
            elif "dst" in ipv4_notes:
                outdict["src_or_dst"] = "dst"
            #None, otherwise
            annorecs = geoproc_library.forensic_path_to_anno_recs(annocur, outdict["forensic_path"])
            if annorecs and len(annorecs) > 1:
                sys.stderr.write("Warning: Multiple files found to own forensic path %r. Only using first. This may cause strange results.\n" % outdict["forensic_path"])
            if annorecs and len(annorecs) > 0:
                annorec = annorecs[0]
                outdict["obj_id"] = annorec.get("obj_id")
                outdict["fs_obj_id"] = annorec.get("fs_obj_id")
                outdict["fiwalk_id"] = annorec.get("fiwalk_id")
            if ipv4 in ips_to_locs:
                for key in [
                  "maxmind_ipv4_time",
                  "country",
                  "region",
                  "city",
                  "postalCode",
                  "latitude",
                  "longitude"
                ]:
                    outdict[key] = ips_to_locs[ipv4][key]
            geoproc_library.insert_db(outcur, "ipv4s_votes", outdict)
    outconn.commit()
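
#geoproc_library.bulk_extractor_ips, consumed above and again in the email
#analysis below, yields (forensic_path, ipv4, ipv4_notes) triples.  A sketch
#of a compatible generator, assuming Bulk Extractor's tab-delimited ip.txt
#feature file layout (forensic path, IPv4 string, then notes such as
#"struct ip R (src) cksum-ok"); this is an assumption for illustration, not
#the library's code:
def bulk_extractor_ips_sketch(be_dir):
    import os
    with open(os.path.join(be_dir, "ip.txt"), "r", encoding="utf-8", errors="replace") as ip_file:
        for line in ip_file:
            if line.startswith("#"):
                continue  #Skip the Bulk Extractor banner comments
            parts = line.rstrip("\n").split("\t")
            if len(parts) >= 3:
                yield (parts[0], parts[1], parts[2])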
def main():
    global args
    if os.path.isfile(args.output_db):
        if args.zap:
            os.remove(args.output_db)
        else:
            raise Exception("Output database already exists; aborting.")

    #Connect to location database to get names (necessary for wildcard matching, like if we just found a city)
    config = geoproc_cfg.config
    lookupconn = mdb.Connect(
      host=config.get("mysql", "maxmind_server"),
      user=config.get("mysql", "maxmind_read_username"),
      password=geoproc_cfg.db_password("maxmind_read_password_file"),
      db=config.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)

    #Maybe connect to ground truth
    gtconn = None
    gtcur = None
    if args.ground_truth:
        gtconn = sqlite3.connect(args.ground_truth)
        gtconn.row_factory = sqlite3.Row
        #Don't lock database
        gtcur = gtconn.cursor()

    results_dir_list = geoproc_library.get_results_dirs(args.input_root)
    dprint("Aggregating %d directories." % len(results_dir_list))

    #Connect to output database
    outconn = sqlite3.connect(args.output_db)
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()

    def add_columns(outcur, table_name):
        #Simple aggregate table: Just gets column for image_id
        outcur.execute("ALTER TABLE %s ADD COLUMN image_id TEXT;" % table_name)
        #Weighted aggregate table: Gets other columns to determine vote accuracy
        outcur.execute("CREATE TABLE %s_weighted AS SELECT * FROM %s;" % (table_name, table_name))
        outcur.execute("ALTER TABLE %s_weighted ADD COLUMN number_possible_locations NUMBER;" % table_name)
        for bcol in ["country", "region", "city", "location"]:
            outcur.execute("ALTER TABLE %s_weighted ADD COLUMN correct_%s NUMBER;" % (table_name, bcol))

    outcur.execute(analyze_cookie_files.SQL_CREATE_COOKIE_FILES_VOTES)
    add_columns(outcur, "cookie_files_votes")
    outcur.execute(analyze_email_files.SQL_CREATE_EMAIL_FILES_VOTES)
    add_columns(outcur, "email_files_votes")
    outcur.execute(analyze_exif_headers.SQL_CREATE_EXIF_HEADERS_VOTES)
    add_columns(outcur, "exif_headers_votes")
    outcur.execute(analyze_ipv4s.SQL_CREATE_IPV4S_VOTES)
    add_columns(outcur, "ipv4s_votes")

    for results_dir in results_dir_list:
        try:
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_cookie_files.sh", "cookie_files_votes.db", "cookie_files_votes")
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_email_files.sh", "email_files_votes.db", "email_files_votes")
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_exif_headers.sh", "exif_headers_votes.db", "exif_headers_votes")
            ingest_table(outcur, lookupcur, gtcur, results_dir, "analyze_ipv4s.sh", "ipv4s_votes.db", "ipv4s_votes")
        except:
            dprint("Debug: Error occurred on results_dir %r." % results_dir)
            raise
    outconn.commit()
def main():
    global args

    #Set up lookup database connection
    cfg = geoproc_cfg.config
    lookupconn = None
    lookupcur = None
    try:
        import mysql.connector as mdb
        lookupconn = mdb.connect(
          host=cfg.get("mysql", "maxmind_server"),
          user=cfg.get("mysql", "maxmind_read_username"),
          password=geoproc_cfg.db_password("maxmind_read_password_file"),
          db=cfg.get("mysql", "maxmind_schema"),
          use_unicode=True
        )
        lookupcur = lookupconn.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    except Exception:
        sys.stderr.write("Warning: Could not connect to database. Proceeding without database support.\n")

    #Connect to annodb
    annoconn, annocur = geoproc_library.connect_to_fs_anno_db(args.annodb)

    #Verify input
    manifest_path = os.path.join(args.emaildir, "manifest.txt")
    if not os.path.isfile(manifest_path):
        raise Exception("Error: manifest.txt not found in input directory.")

    #Ingest BE ips, if available
    #Stash in (once-tested) histogram.
    #Dictionary key: ipv4 address
    #Dictionary value: (notes, tally) default dictionary.
    ip_notes_histogram = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
    if args.bulk_extractor_output:
        for (forensic_path, ipv4, ipv4_notes) in geoproc_library.bulk_extractor_ips(args.bulk_extractor_output):
            ip_notes_histogram[ipv4][ipv4_notes] += 1
    dprint("Debug: Number of IPv4s with notes: %d." % len(ip_notes_histogram.keys()))

    #Set up output database
    outdbpath = os.path.join(args.outdir, "email_files_votes.db")
    if os.path.isfile(outdbpath):
        raise Exception("Error: Output database already exists. This script won't overwrite. Aborting.")
    outconn = sqlite3.connect(outdbpath)
    outconn.isolation_level = "EXCLUSIVE"
    outconn.row_factory = sqlite3.Row
    outcur = outconn.cursor()
    outcur.execute(SQL_CREATE_EMAIL_FILES_VOTES)

    for (fiwalk_id, messageno, message) in emails_in_dir_manifest(manifest_path):
        dprint("Debug: Analyzing a record from fiwalk_id %r." % fiwalk_id)
        #print(repr(type(message)))
        #for i in message.keys():
        #    print('%r: %r' % (i, message.get_all(i)))
        received_recs = message.get_all("Received")
        if not received_recs:
            continue
        pathlength = len(received_recs)
        for (pathindex, pathline) in enumerate(received_recs):
            #TODO Just getting all the IPs for now; filter later
            ips = geoproc_library.all_ipv4s(pathline)
            dprint("Debug: Found this many IPs: %d.\n\t%r" % (len(ips), ips))

            #Can we get a date?
            maybe_timestamp = None
            maybe_timestamp_match = dfxml.rx_rfc822datetime.search(pathline)
            if maybe_timestamp_match:
                thestring = maybe_timestamp_match.string
                thespan = maybe_timestamp_match.span()
                thedatestring = thestring[thespan[0]:thespan[1]]
                try:
                    maybe_timestamp = dfxml.dftime(thedatestring)
                except Exception:
                    sys.stderr.write("Warning: An error occurred trying to parse time input.\nInput: %r\nStack trace:\n" % thedatestring)
                    sys.stderr.write(traceback.format_exc())
                    sys.stderr.write("\n")
                    #Don't stop here.
            dprint("Debug: Believed timestamp: %r." % maybe_timestamp)

            #Now that we have a date, can we get locations?
            if maybe_timestamp:
                #Can we get a single recipient? (This is, of course, not guaranteed to be the owner.)
                sole_recipient = None
                delivered_to_headers = message.get_all("Delivered-To")
                to_headers = message.get_all("To")
                if delivered_to_headers and len(delivered_to_headers) == 1:
                    sole_recipient = delivered_to_headers[0]
                elif to_headers and len(to_headers) == 1 and len(to_headers[0].split("\n")) == 1:
                    sole_recipient = to_headers[0]

                all_ip_locations = geoproc_library.ips_to_locations(lookupcur, maybe_timestamp.datetime(), ips)
                dprint("Debug: Fetched these IP location records:\n\t%r" % all_ip_locations)
                for ip in ips:
                    outdict = {"fiwalk_id": fiwalk_id}
                    #TODO Use annodb to get TSK identifiers
                    outdict["message_index"] = messageno
                    outdict["ipv4"] = ip
                    outdict["received_path_index"] = pathindex
                    outdict["received_path_length"] = pathlength
                    outdict["received_header_text"] = pathline
                    outdict["database_queried"] = all_ip_locations is not None
                    outdict["believed_timestamp"] = str(maybe_timestamp)
                    outdict["sole_recipient_domain_is_webmail"] = geoproc_library.in_webmail_domain(sole_recipient)
                    if all_ip_locations is not None and ip in all_ip_locations:
                        rec = all_ip_locations[ip]
                        outdict["latitude"] = rec.get("latitude")
                        outdict["longitude"] = rec.get("longitude")
                        outdict["postalCode"] = rec.get("postalCode")
                        outdict["maxmind_ipv4_time"] = dfxml.dftime(rec.get("maxmind_ipv4_time")).iso8601()
                        if rec.get("country"):
                            outdict["country"] = rec["country"]
                        if rec.get("region"):
                            outdict["region"] = rec["region"]
                        if rec.get("city"):
                            outdict["city"] = rec["city"]
                    dprint("Debug: Checking for IP notes for %r." % ip)
                    if ip in ip_notes_histogram:
                        dprint("Debug: Formatting notes for %r." % ip)
                        notedict = ip_notes_histogram[ip]
                        notelist = sorted(notedict.keys())
                        notes_to_format = []
                        for note in notelist:
                            notes_to_format.append("%d %r" % (notedict[note], note))
                        outdict["ipv4_be_notes"] = "; ".join(notes_to_format)
                        outdict["ipv4_be_has_cksum_or_socket"] = "sockaddr" in outdict["ipv4_be_notes"] or "cksum-ok" in outdict["ipv4_be_notes"]
                    dprint("Debug: Outdict just before inserting:\n\t%r" % outdict)
                    geoproc_library.insert_db(outcur, "email_files_votes", outdict)
    outconn.commit()
    dprint("Debug: Done.")
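
#emails_in_dir_manifest, iterated above, is defined elsewhere in this script;
#it yields (fiwalk_id, messageno, message) triples.  A hypothetical sketch,
#assuming manifest.txt lists one extracted message file per line and that the
#files are named by fiwalk id (both assumptions for illustration only):
import email
import os

def emails_in_dir_manifest_sketch(manifest_path):
    dirpath = os.path.dirname(manifest_path)
    messageno = 0
    with open(manifest_path, "r", encoding="utf-8") as manifest:
        for line in manifest:
            fname = line.strip()
            if not fname:
                continue
            #Hypothetical naming convention: "<fiwalk_id>.txt"
            fiwalk_id = int(os.path.splitext(os.path.basename(fname))[0])
            with open(os.path.join(dirpath, fname), "rb") as message_file:
                yield (fiwalk_id, messageno, email.message_from_binary_file(message_file))
            messageno += 1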
def main():
    ## The program outline is embedded inline with double-hash comments (fit for grepping).

    ## Parse parameters
    parser = argparse.ArgumentParser(description="Create and populate MySQL database with MaxMind data.")
    parser.add_argument("-r", "--regress", action="store_true", dest="regress", help="Run unit tests and exit. Requires database connection.")
    parser.add_argument("-d", "--debug", action="store_true", dest="debug", help="Print debug information to stderr.")
    parser.add_argument("zips", help="Text file of MaxMind zips to ingest, one absolute path per line; or a directory containing the zips; or just one zip.")
    args = parser.parse_args()

    input_file_list = []
    if args.zips.endswith(".zip"):
        input_file_list.append(os.path.abspath(os.path.expanduser(args.zips)))
    elif os.path.isdir(args.zips):
        for zipdirentry in os.listdir(args.zips):
            if zipdirentry.endswith(".zip"):
                input_file_list.append(os.path.abspath(os.path.join(args.zips, zipdirentry)))
    else:
        with open(args.zips, "r") as inputFile:
            for line in inputFile:
                maxmind_zip_path = line.strip()
                if not os.path.isabs(maxmind_zip_path):
                    raise ValueError("Paths in ziplist file must be absolute.")
                input_file_list.append(maxmind_zip_path)

    ## Connect to db
    #Not using a try block - the whole point of this script is filling the database
    config = geoproc_cfg.config
    con = mdb.Connect(
      host=config.get("mysql", "maxmind_server"),
      user=config.get("mysql", "maxmind_write_username"),
      password=geoproc_cfg.db_password("maxmind_write_password_file"),
      db=config.get("mysql", "maxmind_schema"),
      use_unicode=True
    )
    cur = con.cursor(cursor_class=geoproc_cfg.MySQLCursorDict)
    cur.execute("SET NAMES 'utf8';")

    ## Run regress tests?
    #TODO

    ## Create tables if non-existent
    if args.debug:
        sys.stderr.write("Creating tables...\n")
    #Datetime vs timestamp: <http://stackoverflow.com/a/409305/1207160>
    cur.execute(SQL_CREATE_ingest_audit_zips)
    cur.execute(SQL_CREATE_ingest_audit_locations)
    cur.execute(SQL_CREATE_ingest_audit_blocks)
    cur.execute(SQL_CREATE_LocationTable)
    cur.execute(SQL_CREATE_BlockTable)
    cur.execute(SQL_CREATE_LocationLatLongs)

    ## Record the number of location tables processed - non-zero triggers a table refresh later
    tally_location_updates = 0

    ## Read table of processed zip files
    ## # Should contain: Zip file base name, full path to zip file, datetime started, datetime ended
    ## Read table of processed block files
    ## # Should contain: Zip file base name (foreign key), mtime, datetime started, datetime ended
    ## Read table of processed location files
    ## # Should contain: Zip file base name (foreign key), mtime, datetime started, datetime ended
    ## Read input list of zip files to ingest
    for maxmind_zip_path in input_file_list:
        maxmind_zip_basename = os.path.basename(maxmind_zip_path)
        ## For each zip in the list:
        ## If the zip had not finished processing previously:
        previously_finished_zip = False
        reset_this_zip = False
        cur.execute("SELECT * FROM ingest_audit_zips WHERE zip_basename = %s;", (maxmind_zip_basename,))
        for row in cur.fetchall():
            if row["datetime_end"] is None:
                #This row shouldn't be here at all.
                reset_this_zip = True
            else:
                previously_finished_zip = True
        if previously_finished_zip:
            sys.stderr.write("Warning: Zip file already read into database. Skipping zip file: " + maxmind_zip_path + "\n")
            continue

        ## If the block file had started and not finished previously:
        cur.execute("SELECT * FROM ingest_audit_blocks WHERE zip_basename = %s AND datetime_start IS NOT NULL AND datetime_end IS NULL;", (maxmind_zip_basename,))
        rows = [row for row in cur.fetchall()]
        for row in rows:
            sys.stderr.write("Note: Found incomplete Block ingest for zip %r, started %r. Deleting records.\n" % (row["zip_basename"], row["datetime_start"]))
            ## Delete block records with matching mtime
            cur.execute("DELETE FROM BlockTable WHERE lastModifiedTime = %s;", (row["mtime"],))
        if len(rows) > 0:
            cur.execute("DELETE FROM ingest_audit_blocks WHERE zip_basename = %s AND datetime_end IS NULL;", (maxmind_zip_basename,))

        ## If the location file had started and not finished previously:
        cur.execute("SELECT * FROM ingest_audit_locations WHERE zip_basename = %s AND datetime_start IS NOT NULL AND datetime_end IS NULL;", (maxmind_zip_basename,))
        rows = [row for row in cur.fetchall()]
        for row in rows:
            sys.stderr.write("Note: Found incomplete Location ingest for zip %r, started %r. Deleting records.\n" % (row["zip_basename"], row["datetime_start"]))
            ## Delete location records with matching mtime
            cur.execute("DELETE FROM LocationLatLongs WHERE lastModifiedTime = %s;", (row["mtime"],))
            cur.execute("DELETE FROM LocationTable WHERE lastModifiedTime = %s;", (row["mtime"],))
        if len(rows) > 0:
            cur.execute("DELETE FROM ingest_audit_locations WHERE zip_basename = %s AND datetime_end IS NULL;", (maxmind_zip_basename,))

        ## Mark start time for processing zip
        starttime_zip = datetime.datetime.now()
        if reset_this_zip:
            cur.execute("DELETE FROM ingest_audit_zips WHERE zip_basename = %s;", (maxmind_zip_basename,))
        #strftime appears to be the way to insert datetimes from Python to MySQL <http://stackoverflow.com/a/4508923/1207160>
        cur.execute("INSERT INTO ingest_audit_zips(zip_basename, zip_full_path, datetime_start) VALUES (%s,%s,%s);", (maxmind_zip_basename, maxmind_zip_path, datetime.datetime.strftime(starttime_zip, "%Y-%m-%d %H:%M:%S")))
        con.commit()

        ## Get zip internal listing
        with zipfile.ZipFile(maxmind_zip_path, "r") as this_zip:
            zipped_blocks_name = None
            zipped_locations_name = None
            for zi in this_zip.infolist():
                if zi.filename.endswith("Location.csv"):
                    zipped_locations_name = zi.filename
                    zipped_locations_time = datetime_from_ziptime(zi.date_time)
                elif zi.filename.endswith("Blocks.csv"):
                    zipped_blocks_name = zi.filename
                    zipped_blocks_time = datetime_from_ziptime(zi.date_time)

            ## If the location file had not finished processing previously:
            should_process_locations = False
            if zipped_locations_name is not None:
                zipped_locations_time_str = datetime.datetime.strftime(zipped_locations_time, "%Y-%m-%d %H:%M:%S")
                cur.execute("SELECT COUNT(*) AS tally FROM ingest_audit_locations WHERE zip_basename = %s AND datetime_end IS NOT NULL;", (maxmind_zip_basename,))
                for rec in cur.fetchall():
                    should_process_locations = (rec["tally"] == 0)
            if should_process_locations:
                ## Mark start time
                starttime_locations = datetime.datetime.now()
                cur.execute("INSERT INTO ingest_audit_locations(zip_basename, mtime, datetime_start) VALUES (%s,%s,%s);", (maxmind_zip_basename, zipped_locations_time_str, datetime.datetime.strftime(starttime_locations, "%Y-%m-%d %H:%M:%S")))
                con.commit()
                ## Ingest
                with this_zip.open(zipped_locations_name, "r") as locations_file:
                    if args.debug:
                        sys.stderr.write("Note: Ingesting %s / %s...\n" % (maxmind_zip_path, zipped_locations_name))
                    lineCount = 0
                    #MaxMind content encoding: http://dev.maxmind.com/geoip/csv
                    locations_file_wrapped = io.TextIOWrapper(locations_file, encoding="iso-8859-1")
                    locations_reader = csv.reader(locations_file_wrapped, delimiter=',', quoting=csv.QUOTE_ALL)
                    for row in locations_reader:
                        if len(row) == 9 and row[0].isdigit():
                            cur.execute("INSERT INTO LocationTable VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);", (
                              int(row[0]),
                              zipped_locations_time_str,
                              row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8]
                            ))
                            lineCount += 1
                            if lineCount % 10000 == 0:
                                if args.debug:
                                    sys.stderr.write("\tOn locations CSV record %d...\n" % lineCount)
                                con.commit()
                    con.commit()
                ## Update lat/long index
                if args.debug:
                    sys.stderr.write("Note: Updating lat/long index.\n")
                cur.execute("""
                  INSERT INTO LocationLatLongs (latlong, locId, lastModifiedTime)
                  SELECT
                    POINT(latitude, longitude),
                    locId,
                    lastModifiedTime
                  FROM
                    LocationTable
                  WHERE
                    lastModifiedTime = %s
                  ;
                """, (zipped_locations_time_str,))
                con.commit()
                ## Mark end time
                endtime_locations = datetime.datetime.now()
                cur.execute("UPDATE ingest_audit_locations SET datetime_end = %s WHERE zip_basename = %s;", (datetime.datetime.strftime(endtime_locations, "%Y-%m-%d %H:%M:%S"), maxmind_zip_basename))
                con.commit()
                tally_location_updates += 1

            ## If the block file had not finished processing previously:
            should_process_blocks = False
            if zipped_blocks_name is not None:
                zipped_blocks_time_str = datetime.datetime.strftime(zipped_blocks_time, "%Y-%m-%d %H:%M:%S")
                cur.execute("SELECT COUNT(*) AS tally FROM ingest_audit_blocks WHERE zip_basename = %s AND datetime_end IS NOT NULL;", (maxmind_zip_basename,))
                for rec in cur.fetchall():
                    should_process_blocks = (rec["tally"] == 0)
            if should_process_blocks:
                ## Mark start time
                starttime_blocks = datetime.datetime.now()
                cur.execute("INSERT INTO ingest_audit_blocks(zip_basename, mtime, datetime_start) VALUES (%s,%s,%s);", (maxmind_zip_basename, zipped_blocks_time_str, datetime.datetime.strftime(starttime_blocks, "%Y-%m-%d %H:%M:%S")))
                con.commit()
                ## Ingest
                with this_zip.open(zipped_blocks_name, "r") as blocks_file:
                    if args.debug:
                        sys.stderr.write("Note: Ingesting %s / %s...\n" % (maxmind_zip_path, zipped_blocks_name))
                    lineCount = 0
                    #MaxMind content encoding: http://dev.maxmind.com/geoip/csv
                    blocks_file_wrapped = io.TextIOWrapper(blocks_file, encoding="iso-8859-1")
                    blocks_reader = csv.reader(blocks_file_wrapped, delimiter=',', quoting=csv.QUOTE_ALL)
                    for row in blocks_reader:
                        if len(row) == 3 and row[0].isdigit():
                            cur.execute("INSERT INTO BlockTable VALUES(%s,%s,%s,%s);", (row[0], row[1], row[2], zipped_blocks_time_str))
                            lineCount += 1
                            if lineCount % 10000 == 0:
                                if args.debug:
                                    sys.stderr.write("\tOn blocks CSV record %d...\n" % lineCount)
                                con.commit()
                    con.commit()
                ## Mark end time
                endtime_blocks = datetime.datetime.now()
                cur.execute("UPDATE ingest_audit_blocks SET datetime_end = %s WHERE zip_basename = %s;", (datetime.datetime.strftime(endtime_blocks, "%Y-%m-%d %H:%M:%S"), maxmind_zip_basename))
                con.commit()

        ## Mark end time for processing zip
        endtime_zip = datetime.datetime.now()
        cur.execute("UPDATE ingest_audit_zips SET datetime_end = %s WHERE zip_basename = %s;", (datetime.datetime.strftime(endtime_zip, "%Y-%m-%d %H:%M:%S"), maxmind_zip_basename))
        con.commit()

    #If we updated the LocationTable, refresh the list of all distinct location triples
    if tally_location_updates > 0:
        if args.debug:
            sys.stderr.write("Note: Refreshing DistinctLocations table.\n")
        cur.execute(SQL_DROP_DistinctLocations)
        cur.execute(SQL_CREATE_DistinctLocations)
        for locfield in ["country", "region", "city"]:
            formatdict = {"locfield": locfield}
            cur.execute("DROP TABLE IF EXISTS Distinct_%(locfield)s;" % formatdict)
            con.commit()
            cur.execute("""
              CREATE TABLE Distinct_%(locfield)s AS
              SELECT DISTINCT
                %(locfield)s
              FROM
                DistinctLocations
              WHERE
                %(locfield)s IS NOT NULL AND
                %(locfield)s <> ""
              ORDER BY %(locfield)s
              ;
            """ % formatdict)
            con.commit()

    ## Done.
    if con:
        con.close()
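
#datetime_from_ziptime, used on the zip listing above, converts the 6-tuple
#zipfile.ZipInfo.date_time (year, month, day, hour, minute, second) into a
#datetime.datetime.  A minimal sketch of such a helper (the real definition
#lives elsewhere in this script):
import datetime

def datetime_from_ziptime_sketch(ziptime):
    #ZipInfo.date_time's six fields map directly onto the datetime constructor.
    return datetime.datetime(*ziptime)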