def manual_download(captured_sha1):
    """Re-download a previously captured file and record its checksums.

    Looks up the most recent ``pe_dumps`` row whose ``sha1`` equals
    ``captured_sha1``, rebuilds the download URL from that row, fetches it
    through ``download_file`` and inserts the outcome into
    ``manual_download_checksums``.

    Arguments:
        captured_sha1: SHA1 of the originally captured file.

    Returns:
        None. When no ``pe_dumps`` row matches, a notice is printed and
        nothing is downloaded or inserted.
    """
    util.setup_socks()
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # Database query to get the relevant recent record
            cursor.execute(
                """
                SELECT dump_id,host,url,referer,client,server FROM pe_dumps
                    WHERE sha1 = %s
                    ORDER BY timestamp DESC;""",
                (captured_sha1, ))
            row = cursor.fetchone()
            if row is None:
                # Without this guard the row[0] access below raises TypeError.
                print("No pe_dumps record found for sha1 %s"
                      % (captured_sha1, ))
                return
            # referer (row[3]) and client (row[4]) are selected but not needed.
            dump_id, host, url, server = row[0], row[1], row[2], row[5]

            # if host is null, we use the server IP
            ordered_host = server
            if host:
                # NOTE(review): reorder_domain presumably restores the
                # natural label order of the stored host -- confirm in util.
                ordered_host = util.reorder_domain(host)
            full_url = "http://" + ordered_host
            if url:
                full_url += url
            print("Starting manual download from : %s" % (full_url, ))

            # Prepare the urllib2 request
            req = urllib2.Request(full_url)
            req.add_header("User-Agent", USER_AGENT)

            download_time = time.time()
            sha1, md5, different, is_interesting_file = download_file(
                dump_id, req, captured_sha1)

            # Database statement
            cursor.execute(
                """
                INSERT INTO manual_download_checksums(dump_id, sha1,
                    md5, different, referer_exists, timestamp, is_pe)
                VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
                (dump_id, sha1, md5, different, False, download_time,
                 is_interesting_file))
        finally:
            # Release the cursor even when the download or the insert fails.
            cursor.close()
    finally:
        conn.close()
def make_syslog_entry(cursor, dump_id, score):
    """Write one syslog line describing the download ``dump_id``.

    Arguments:
        cursor: open DB cursor used to read the download record.
        dump_id: id of the ``pe_dumps`` row to report.
        score: classifier score for this download, or None. When given it is
            converted to float and compared with ``amico_threshold`` to label
            the entry MALWARE or BENIGN; when None the score field is "-".

    Returns:
        None. Nothing is logged when the query yields no row.
    """
    # Database query to get the relevant record. dump_id is passed as a bound
    # parameter instead of being interpolated into the SQL text.
    cursor.execute(
        """
        SELECT timestamp, client, server, dst_port, host, url, referer,
            pe.sha1, pe.md5, file_size, num_av_labels, corrupt, file_type
        FROM pe_dumps as pe LEFT JOIN virus_total_scans as vts USING(sha1)
        WHERE (corrupt = 'false' OR num_av_labels IS NOT NULL)
            AND dump_id = %s
        ORDER BY vts.query_time DESC
        """,
        (dump_id, ))
    if cursor.rowcount == 0:
        return
    row = cursor.fetchone()
    if row is None:
        return

    log_data = list(row)
    # Index 4 is the host column of the SELECT above.
    log_data[4] = reorder_domain(log_data[4])

    # The score is taken from the argument only; no DB lookup happens here.
    report = "-"
    if score is not None:
        # just to make sure we are dealing with real numbers and not a string
        score = float(score)
        if score > amico_threshold:
            report = "MALWARE"
        else:
            report = "BENIGN"
        report += "#%s#%s" % (score, amico_threshold)
    log_data.append(report)

    entry = (
        "file download -- timestamp: %s, client_ip: %s, server_ip:"
        " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
        " %s, file_size: %s, av_labels: %s, corrupt: %s, file_type: %s, amico_score: %s"
        % tuple(log_data))
    syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry)
def manual_download(captured_sha1):
    """Re-download a previously captured file and record its checksums.

    Looks up the most recent ``pe_dumps`` row whose ``sha1`` equals
    ``captured_sha1``, rebuilds the download URL from that row, fetches it
    through ``download_file`` and inserts the outcome into
    ``manual_download_checksums``.

    Arguments:
        captured_sha1: SHA1 of the originally captured file.

    Returns:
        None. When no ``pe_dumps`` row matches, a notice is printed and
        nothing is downloaded or inserted.
    """
    util.setup_socks()
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # Database query to get the relevant recent record
            cursor.execute(
                """
                SELECT dump_id,host,url,referer,client,server FROM pe_dumps
                    WHERE sha1 = %s
                    ORDER BY timestamp DESC;""",
                (captured_sha1, ))
            row = cursor.fetchone()
            if row is None:
                # Without this guard the row[0] access below raises TypeError.
                print("No pe_dumps record found for sha1 %s"
                      % (captured_sha1, ))
                return
            # referer (row[3]) and client (row[4]) are selected but not needed.
            dump_id, host, url, server = row[0], row[1], row[2], row[5]

            # With no recorded host, fall back to the server address as-is.
            ordered_host = server
            if host:
                # NOTE(review): reorder_domain presumably restores the
                # natural label order of the stored host -- confirm in util.
                ordered_host = util.reorder_domain(host)
            full_url = "http://" + ordered_host
            if url:
                # A NULL url used to break the string concatenation.
                full_url += url

            # Prepare the urllib2 request
            req = urllib2.Request(full_url)
            req.add_header("User-Agent", USER_AGENT)

            download_time = time.time()
            sha1, md5, different, is_pe = download_file(
                dump_id, req, captured_sha1)

            # Database statement
            cursor.execute(
                """
                INSERT INTO manual_download_checksums(dump_id, sha1,
                    md5, different, referer_exists, timestamp, is_pe)
                VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
                (dump_id, sha1, md5, different, False, download_time, is_pe))
        finally:
            # Release the cursor even when the download or the insert fails.
            cursor.close()
    finally:
        conn.close()
def make_syslog_entry(cursor, dump_id):
    """Write one syslog line describing the PE download ``dump_id``.

    The download record is read from ``pe_dumps`` joined with its VirusTotal
    scan, and the score from ``amico_scores``. A score above
    ``amico_threshold`` is reported as MALWARE, otherwise BENIGN; a missing
    score is reported as "-".

    Arguments:
        cursor: open DB cursor used for both queries.
        dump_id: id of the ``pe_dumps`` row to report.

    Returns:
        None. Nothing is logged when the first query yields no row.
    """
    # Database query to get the relevant record. dump_id is passed as a bound
    # parameter, and the scan table is tied to the mapping table through
    # vt_id so that only this dump's scan is joined in.
    cursor.execute(
        """
        SELECT timestamp, client, server, dst_port, host, url, referer,
            pe.sha1, pe.md5, file_size, trusted_av_labels, corrupt
        FROM pe_dumps as pe
            JOIN ped_vts_mapping as pvm USING(dump_id),
            virus_total_scans as vts
        WHERE dump_id = %s AND pvm.vt_id = vts.vt_id
        """,
        (dump_id, ))
    if cursor.rowcount == 0:
        return
    row = cursor.fetchone()
    if row is None:
        return

    log_data = list(row)
    # Index 4 is the host column of the SELECT above.
    log_data[4] = reorder_domain(log_data[4])

    cursor.execute(
        """
        SELECT score FROM amico_scores
        WHERE dump_id = %s
        """,
        (dump_id, ))

    report = "-"
    if cursor.rowcount > 0:
        score_row = cursor.fetchone()
        score = score_row[0] if score_row is not None else None
        if score is not None:
            if score > amico_threshold:
                report = "MALWARE"
            else:
                report = "BENIGN"
            report += "#%s#%s" % (score, amico_threshold)
    log_data.append(report)

    entry = (
        "PE file download -- timestamp: %s, client_ip: %s, server_ip:"
        " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
        " %s, file_size: %s, av_labels: %s, corrupt: %s, amico_score: %s"
        % tuple(log_data))
    syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry)
def make_syslog_entry(cursor, dump_id):
    """Write one syslog line describing the PE download ``dump_id``.

    The download record is read from ``pe_dumps`` joined with its VirusTotal
    scan, and the score from ``amico_scores``. A score above
    ``amico_threshold`` is reported as MALWARE, otherwise BENIGN; a missing
    score is reported as "-".

    Arguments:
        cursor: open DB cursor used for both queries.
        dump_id: id of the ``pe_dumps`` row to report.

    Returns:
        None. Nothing is logged when the first query yields no row.
    """
    # Database query to get the relevant record. dump_id is passed as a bound
    # parameter, and the scan table is tied to the mapping table through
    # vt_id so that only this dump's scan is joined in.
    record_query = """
        SELECT timestamp, client, server, dst_port, host, url, referer,
            pe.sha1, pe.md5, file_size, trusted_av_labels, corrupt
        FROM pe_dumps as pe
            JOIN ped_vts_mapping as pvm USING(dump_id),
            virus_total_scans as vts
        WHERE dump_id = %s AND pvm.vt_id = vts.vt_id
        """
    cursor.execute(record_query, (dump_id, ))
    if cursor.rowcount == 0:
        return
    row = cursor.fetchone()
    if row is None:
        return

    log_data = list(row)
    # Index 4 is the host column of the SELECT above.
    log_data[4] = reorder_domain(log_data[4])

    score_query = """
        SELECT score FROM amico_scores
        WHERE dump_id = %s
        """
    cursor.execute(score_query, (dump_id, ))

    report = "-"
    if cursor.rowcount > 0:
        score_row = cursor.fetchone()
        score = score_row[0] if score_row is not None else None
        if score is not None:
            if score > amico_threshold:
                report = "MALWARE"
            else:
                report = "BENIGN"
            report += "#%s#%s" % (score, amico_threshold)
    log_data.append(report)

    entry = (
        "PE file download -- timestamp: %s, client_ip: %s, server_ip:"
        " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
        " %s, file_size: %s, av_labels: %s, corrupt: %s, amico_score: %s"
        % tuple(log_data))
    syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry)
def db_pe_dumps(file_path, sha1, md5, file_size):
    """Parse a dump file's HTTP metadata and insert it into ``pe_dumps``.

    The first six lines of the file are read in a fixed order (timestamp,
    IP/port pair, request line, Host, Referer, corruption marker); the rest
    of the file is searched for the ``Server`` and ``Content-Type`` response
    headers. Fields whose pattern does not match are stored as NULL.

    Arguments:
        file_path: path of the dump file to parse.
        sha1: SHA1 of the dumped file.
        md5: MD5 of the dumped file.
        file_size: size of the dumped file.

    Returns:
        Tuple ``(dump_id, corrupt_pe)``: the id of the newest ``pe_dumps``
        row for ``sha1`` and whether the dump carried a corruption marker.
    """
    # Read everything up front; the with-block closes the file even if a
    # read fails.
    with open(file_path) as dump_file:
        timestamp_line = dump_file.readline()
        ip_line = dump_file.readline()
        request_line = dump_file.readline()
        host_line = dump_file.readline()
        referer_line = dump_file.readline()
        corrupt_line = dump_file.readline()
        # Now, parse data from the response
        data = dump_file.read()

    # Timestamp
    timestamp = re.search("[0-9]+", timestamp_line)
    if timestamp is not None:
        timestamp = timestamp.group()

    # Source and Destination IPs. Group 2 becomes srcip, which is inserted
    # into the `server` column below; group 1 becomes dstip (`client`).
    ip = re.search("([0-9.]+):.*-([0-9.]+):([0-9]+)-.*", ip_line)
    if ip is not None:
        srcip = ip.group(2)
        dstip = ip.group(1)
        dst_port = ip.group(3)
    else:
        srcip = None
        dstip = None
        dst_port = None

    # URL
    url = re.search("(GET|POST|HEAD) (.*) ", request_line)
    if url is not None:
        method = url.group(1)[:10]
        url = url.group(2)
    else:
        method = None

    # Host
    host = re.search("Host: (.*)", host_line)
    if host is not None:
        host = util.reorder_domain(host.group(1).strip())

    # Referer
    referer = re.search("Referer: (.*)", referer_line)
    if referer is not None:
        referer = referer.group(1)

    # CORRUPT_PE
    corrupt_pe = re.search("CORRUPT_(PE|FILE)", corrupt_line) is not None

    # Server
    server = re.search("Server: (.*)", data)
    if server is not None:
        server = server.group(1).rstrip("\r")[:64]

    # Content-Type
    cont_type = re.search("Content-Type: (.*)", data)
    if cont_type is not None:
        cont_type = cont_type.group(1).rstrip("\r")[:128]

    # Use Autocommit mode for database connection
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # Database statement
            cursor.execute(
                """
                INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host,
                referer,server_application,content_type,dst_port,corrupt,file_size)
                VALUES
                (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (sha1, md5, timestamp, srcip, dstip, method, url, host,
                 referer, server, cont_type, dst_port, corrupt_pe, file_size))
            # Only the newest row is needed, so do not fetch the whole set.
            cursor.execute(
                """
                SELECT dump_id FROM pe_dumps where sha1 = %s
                    ORDER BY dump_id DESC LIMIT 1
                """,
                (sha1, ))
            dump_id = cursor.fetchone()[0]
        finally:
            cursor.close()
    finally:
        conn.close()

    print("A new entry on host:%s has been made in pe_dumps table with "
          "dump_id %s" % (host, dump_id))
    return dump_id, corrupt_pe
def insert_twold_based_features(cursor, dump_id):
    """Compute second-level-domain (2LD) features and store them.

    Earlier downloads whose ``host`` matches the 2LD prefix of this
    download's host are aggregated over the window
    ``(dump_id - MAX_PAST_DUMPS, dump_id)``; the results are written to the
    ``twold_*`` columns of ``weka_features``.

    Arguments:
        cursor: open DB cursor.
        dump_id: id of the download being described.

    Returns:
        None. Returns early, after printing a message, when the download has
        no host or its 2LD cannot be derived and the host is not an IP.
    """
    cursor.execute(
        """
        SELECT host FROM pe_dumps where dump_id = %s""",
        (dump_id, ))
    row = cursor.fetchone()
    if row is None or not row[0]:
        print("host is None!")
        return

    # Bind host before the try block so the handler below can always read
    # it, even when reorder_domain itself is what failed.
    host = row[0]
    try:
        host = util.reorder_domain(host)
        twold = util.extract_twold(host)
        twold = util.reorder_domain(twold)
        twold += '%'
    except Exception as e:
        # capturing known causes
        if util.is_ip(host):
            # IP hosts have no 2LD; match the stored value exactly.
            twold = row[0]
        else:
            print("Error in extracting 2LD!,  %s %s %s" % (e, host, dump_id))
            return

    # (LIKE pattern, upper dump_id bound, lower dump_id bound)
    window = (twold, dump_id, dump_id - MAX_PAST_DUMPS)

    def fetch_count(query, params=window):
        cursor.execute(query, params)
        return cursor.fetchone()[0]

    twold_total_downloads = fetch_count(
        """
        SELECT COUNT(DISTINCT dump_id) FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.dump_id < %s AND
            pe.dump_id > %s""")

    # NOTE: a vt_month_shelf-based count used to be computed here; it was
    # disabled due to the 403 error from VT.

    # The three VirusTotal-based counts differ only in their label condition.
    vt_count_head = """
        SELECT COUNT(DISTINCT dump_id) FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE """
    vt_count_tail = """ AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND
            pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id"""
    twold_benign_downloads = fetch_count(
        vt_count_head + "vts.num_av_labels = 0" + vt_count_tail)
    twold_malware_downloads = fetch_count(
        vt_count_head + "vts.trusted_av_labels > 1" + vt_count_tail)
    twold_suspicious_downloads = fetch_count(
        vt_count_head + "vts.num_av_labels > 1" + vt_count_tail)

    if twold_total_downloads == 0:
        twold_benign_ratio = None
        twold_malware_ratio = None
        twold_suspicious_ratio = None
    else:
        total = float(twold_total_downloads)
        twold_benign_ratio = twold_benign_downloads / total
        twold_malware_ratio = twold_malware_downloads / total
        twold_suspicious_ratio = twold_suspicious_downloads / total

    # The averages are over distinct sha1s
    cursor.execute(
        """
        SELECT AVG(num_av_labels), AVG(trusted_av_labels)
        FROM
            (SELECT pe.sha1, MAX(dump_id) AS max_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND
                pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND
                dump_id > %s AND
                p.corrupt='f') as b
            ON a.max_id = b.dump_id
        WHERE num_av_labels IS NOT NULL""",
        window + window)
    avg_row = cursor.fetchone()
    if avg_row is None:
        twold_avg_av_labels = None
        twold_avg_trusted_labels = None
    else:
        twold_avg_av_labels, twold_avg_trusted_labels = avg_row

    # the oldest scan report is used to get the # of unknown hashes
    # to remove any bias due to VT submissions
    twold_unknown_hashes = fetch_count(
        """
        SELECT COUNT(DISTINCT b.sha1)
        FROM
            (SELECT pe.sha1, MIN(dump_id) AS min_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND
                pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND
                dump_id > %s AND
                p.corrupt='f') as b
            ON a.min_id = b.dump_id
        WHERE num_av_labels IS NULL""",
        window + window)

    twold_total_hashes = fetch_count(
        """
        SELECT COUNT(DISTINCT pe.sha1)
        FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.corrupt = 'f' AND
            pe.dump_id < %s AND
            pe.dump_id > %s """)

    if twold_total_hashes != 0:
        twold_unknown_hash_ratio = (
            float(twold_unknown_hashes) / twold_total_hashes)
    else:
        twold_unknown_hash_ratio = None

    # Best effort: a failed UPDATE is reported but not propagated.
    try:
        cursor.execute(
            """
            UPDATE weka_features set twold_benign_downloads = %s,
                twold_malware_downloads = %s,
                twold_suspicious_downloads = %s,
                twold_total_downloads = %s,
                twold_malware_ratio = %s,
                twold_suspicious_ratio = %s,
                twold_benign_ratio = %s,
                twold_avg_av_labels = %s,
                twold_avg_trusted_labels = %s,
                twold_unknown_hashes = %s,
                twold_total_hashes = %s,
                twold_unknown_hash_ratio = %s
                where dump_id = %s """,
            (twold_benign_downloads, twold_malware_downloads,
             twold_suspicious_downloads, twold_total_downloads,
             twold_malware_ratio, twold_suspicious_ratio,
             twold_benign_ratio, twold_avg_av_labels,
             twold_avg_trusted_labels, twold_unknown_hashes,
             twold_total_hashes, twold_unknown_hash_ratio, dump_id))
    except Exception as e:
        print(e)
        print("Could not insert twold based features for the dump # %s"
              % (dump_id, ))
def insert_hts_based_features(cursor, dump_id):
    """
    Computes host/2ld/server-based features for a given download

    Past downloads sharing this download's host, 2LD prefix or server are
    fetched in a single query, split into three subsets, summarised with
    ``compute_features_hts`` and written to the ``twold_*``, ``host_*`` and
    ``server_ip_*`` columns of ``weka_features``.

    Arguments:
        cursor: DB cursor from existing DB connection
        dump_id: id of download to be classified

    Returns:
        None. Returns early when no ``pe_dumps`` row exists for ``dump_id``.
    """
    # also query for timestamp, so we can use to limit how much we go back in time!
    query = " SELECT host,server,DATE(timestamp) FROM pe_dumps WHERE dump_id = %s "
    cursor.execute(query, (dump_id, ))
    row = cursor.fetchone()
    if not row:
        return

    (host, server, date) = row

    twold = None
    if host is not None:
        domain = util.reorder_domain(host)
        twold = util.reorder_domain(util.extract_twold(domain))
        if twold is None:
            # No 2LD could be extracted; fall back to the host itself.
            twold = host

    # '-NONE-' avoids any matching in "pe.host LIKE %s" in the query below;
    # it is only replaced when a 2LD (or host) is actually known.
    twold_like = '-NONE-'
    if twold is not None:
        twold_like = twold + '.%'

    query = """
        SELECT dump_id,pe.sha1,pe.host,pe.server,trusted_av_labels,num_av_labels
        FROM pe_dumps AS pe JOIN ped_vts_mapping AS pvm USING(dump_id)
            JOIN virus_total_scans AS vts USING(vt_id)
        WHERE pe.corrupt = 'f' AND
            (pe.host = %s OR pe.host LIKE %s OR pe.server = %s) AND
            pe.dump_id < %s AND pe.dump_id > %s AND pe.timestamp > %s
        """
    cursor.execute(query,
                   (host, twold_like, server, dump_id,
                    dump_id - MAX_PAST_DUMPS,
                    date - timedelta(days=MAX_PAST_DAYS)))
    tuples = cursor.fetchall()

    # make the results into a pandas data frame
    columns = ['dump_id', 'sha1', 'host', 'server', 'tavs', 'navs']
    if not tuples:
        df = ps.DataFrame(index=[], columns=columns)
    else:
        df = ps.DataFrame.from_records(tuples)
        df.columns = columns

    ### compute twold-based features
    if twold is None:
        # Nothing can share an unknown 2LD: use an empty slice.
        df_twold = df[0:0]
    else:
        df_twold = df[df['host'].str.startswith(twold) == True]
    twold_v = compute_features_hts(df_twold)

    ### compute host-based features
    df_host = df[df.host == host]
    host_v = compute_features_hts(df_host)

    ### compute server-based features
    df_server = df[df.server == server]
    server_v = compute_features_hts(df_server)

    # Every group stores the same features; only the column prefix differs.
    feature_keys = ('benign_downloads', 'malware_downloads',
                    'suspicious_downloads', 'total_downloads',
                    'malware_ratio', 'suspicious_ratio', 'benign_ratio',
                    'avg_av_labels', 'avg_trusted_labels',
                    'unknown_hashes', 'total_hashes', 'unknown_hash_ratio')
    groups = (('twold', twold_v), ('host', host_v), ('server_ip', server_v))

    assignments = []
    values = []
    for prefix, features in groups:
        for key in feature_keys:
            # "%%s" leaves a literal %s placeholder for the DB driver.
            assignments.append("%s_%s = %%s" % (prefix, key))
            values.append(features[key])

    query = ("UPDATE weka_features SET " + ", ".join(assignments) +
             " where dump_id = %s ")

    # Best effort: a failed UPDATE is reported but not propagated.
    try:
        cursor.execute(query, tuple(values) + (dump_id, ))
    except Exception as e:
        print(e)
        print("Could not insert server-based features for the dump # %s"
              % (dump_id, ))
def insert_twold_based_features(cursor, dump_id):
    """Compute second-level-domain (2LD) features and store them.

    Earlier downloads whose ``host`` matches the 2LD prefix of this
    download's host are aggregated over the window
    ``(dump_id - MAX_PAST_DUMPS, dump_id)``; the results are written to the
    ``twold_*`` columns of ``weka_features``.

    Arguments:
        cursor: open DB cursor.
        dump_id: id of the download being described.

    Returns:
        None. Returns early, after printing a message, when the download has
        no host or its 2LD cannot be derived and the host is not an IP.
    """
    cursor.execute(
        """
        SELECT host FROM pe_dumps where dump_id = %s""",
        (dump_id, ))
    row = cursor.fetchone()
    if row is None or not row[0]:
        print("host is None!")
        return

    # Bind host before the try block so the handler below can always read
    # it, even when reorder_domain itself is what failed.
    host = row[0]
    try:
        host = util.reorder_domain(host)
        twold = util.extract_twold(host)
        twold = util.reorder_domain(twold)
        twold += '%'
    except Exception as e:
        # capturing known causes
        if util.is_ip(host):
            # IP hosts have no 2LD; match the stored value exactly.
            twold = row[0]
        else:
            print("Error in extracting 2LD!,  %s %s %s" % (e, host, dump_id))
            return

    # (LIKE pattern, upper dump_id bound, lower dump_id bound)
    window = (twold, dump_id, dump_id - MAX_PAST_DUMPS)

    def fetch_count(query, params=window):
        cursor.execute(query, params)
        return cursor.fetchone()[0]

    twold_total_downloads = fetch_count(
        """
        SELECT COUNT(DISTINCT dump_id) FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.dump_id < %s AND
            pe.dump_id > %s""")

    # NOTE: a vt_month_shelf-based count used to be computed here; it was
    # disabled due to the 403 error from VT.

    # The three VirusTotal-based counts differ only in their label condition.
    vt_count_head = """
        SELECT COUNT(DISTINCT dump_id) FROM pe_dumps AS pe JOIN
            ped_vts_mapping AS pvm USING (dump_id),
            virus_total_scans AS vts
        WHERE """
    vt_count_tail = """ AND
            pe.host LIKE %s AND
            pe.dump_id < %s AND
            pe.dump_id > %s AND
            vts.vt_id = pvm.vt_id"""
    twold_benign_downloads = fetch_count(
        vt_count_head + "vts.num_av_labels = 0" + vt_count_tail)
    twold_malware_downloads = fetch_count(
        vt_count_head + "vts.trusted_av_labels > 1" + vt_count_tail)
    twold_suspicious_downloads = fetch_count(
        vt_count_head + "vts.num_av_labels > 1" + vt_count_tail)

    if twold_total_downloads == 0:
        twold_benign_ratio = None
        twold_malware_ratio = None
        twold_suspicious_ratio = None
    else:
        total = float(twold_total_downloads)
        twold_benign_ratio = twold_benign_downloads / total
        twold_malware_ratio = twold_malware_downloads / total
        twold_suspicious_ratio = twold_suspicious_downloads / total

    # The averages are over distinct sha1s
    cursor.execute(
        """
        SELECT AVG(num_av_labels), AVG(trusted_av_labels)
        FROM
            (SELECT pe.sha1, MAX(dump_id) AS max_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND
                pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND
                dump_id > %s AND
                p.corrupt='f') as b
            ON a.max_id = b.dump_id
        WHERE num_av_labels IS NOT NULL""",
        window + window)
    avg_row = cursor.fetchone()
    if avg_row is None:
        twold_avg_av_labels = None
        twold_avg_trusted_labels = None
    else:
        twold_avg_av_labels, twold_avg_trusted_labels = avg_row

    # the oldest scan report is used to get the # of unknown hashes
    # to remove any bias due to VT submissions
    twold_unknown_hashes = fetch_count(
        """
        SELECT COUNT(DISTINCT b.sha1)
        FROM
            (SELECT pe.sha1, MIN(dump_id) AS min_id
            FROM pe_dumps AS pe
            WHERE pe.host LIKE %s AND
                pe.dump_id < %s AND
                pe.dump_id > %s AND
                pe.corrupt = 'f' GROUP BY pe.sha1) as a
            JOIN
            (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id
            FROM pe_dumps AS p JOIN
                ped_vts_mapping as pvm USING (dump_id),
                virus_total_scans as vts
            WHERE pvm.vt_id = vts.vt_id AND
                p.host LIKE %s AND
                dump_id < %s AND
                dump_id > %s AND
                p.corrupt='f') as b
            ON a.min_id = b.dump_id
        WHERE num_av_labels IS NULL""",
        window + window)

    twold_total_hashes = fetch_count(
        """
        SELECT COUNT(DISTINCT pe.sha1)
        FROM pe_dumps AS pe
        WHERE pe.host LIKE %s AND
            pe.corrupt = 'f' AND
            pe.dump_id < %s AND
            pe.dump_id > %s """)

    if twold_total_hashes != 0:
        twold_unknown_hash_ratio = (
            float(twold_unknown_hashes) / twold_total_hashes)
    else:
        twold_unknown_hash_ratio = None

    # Best effort: a failed UPDATE is reported but not propagated.
    try:
        cursor.execute(
            """
            UPDATE weka_features set twold_benign_downloads = %s,
                twold_malware_downloads = %s,
                twold_suspicious_downloads = %s,
                twold_total_downloads = %s,
                twold_malware_ratio = %s,
                twold_suspicious_ratio = %s,
                twold_benign_ratio = %s,
                twold_avg_av_labels = %s,
                twold_avg_trusted_labels = %s,
                twold_unknown_hashes = %s,
                twold_total_hashes = %s,
                twold_unknown_hash_ratio = %s
                where dump_id = %s """,
            (twold_benign_downloads, twold_malware_downloads,
             twold_suspicious_downloads, twold_total_downloads,
             twold_malware_ratio, twold_suspicious_ratio,
             twold_benign_ratio, twold_avg_av_labels,
             twold_avg_trusted_labels, twold_unknown_hashes,
             twold_total_hashes, twold_unknown_hash_ratio, dump_id))
    except Exception as e:
        print(e)
        print("Could not insert twold based features for the dump # %s"
              % (dump_id, ))
def db_file_dumps(file_path, sha1, md5, file_size, file_type):
    """Parse a dump file's HTTP metadata and insert it into ``pe_dumps``.

    The first six lines of the file are read in a fixed order (timestamp,
    IP/port pair, request line, Host, Referer, corruption marker); the rest
    of the file is searched for the ``Server`` and ``Content-Type`` response
    headers. Fields whose pattern does not match are stored as NULL.

    Arguments:
        file_path: path of the dump file to parse.
        sha1: SHA1 of the dumped file.
        md5: MD5 of the dumped file.
        file_size: size of the dumped file.
        file_type: type label stored in the ``file_type`` column.

    Returns:
        Tuple ``(dump_id, corrupt_pe, host, dstip, srcip)``: the id of the
        newest ``pe_dumps`` row for ``sha1``, whether the dump carried the
        CORRUPT_FILE marker, the reordered host, and the two parsed IPs.
    """
    # Read everything up front; the with-block closes the file even if a
    # read fails.
    with open(file_path) as dump_file:
        timestamp_line = dump_file.readline()
        ip_line = dump_file.readline()
        request_line = dump_file.readline()
        host_line = dump_file.readline()
        referer_line = dump_file.readline()
        corrupt_line = dump_file.readline()
        # Now, parse data from the response
        data = dump_file.read()

    # Timestamp
    timestamp = re.search('[0-9]+', timestamp_line)
    if timestamp is not None:
        timestamp = timestamp.group()

    # Source and Destination IPs. Group 2 becomes srcip, which is inserted
    # into the `server` column below; group 1 becomes dstip (`client`).
    ip = re.search('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*', ip_line)
    if ip is not None:
        srcip = ip.group(2)
        dstip = ip.group(1)
        dst_port = ip.group(3)
    else:
        srcip = None
        dstip = None
        dst_port = None

    # URL
    url = re.search('(GET|POST|HEAD) (.*) ', request_line)
    if url is not None:
        method = url.group(1)[:10]
        url = url.group(2)
    else:
        method = None

    # Host
    host = re.search('Host: (.*)', host_line)
    if host is not None:
        host = util.reorder_domain(host.group(1).strip())

    # Referer
    referer = re.search('Referer: (.*)', referer_line)
    if referer is not None:
        referer = referer.group(1)

    # CORRUPT_PE
    corrupt_pe = re.search('CORRUPT_FILE', corrupt_line) is not None

    # Server
    server = re.search('Server: (.*)', data)
    if server is not None:
        server = server.group(1).rstrip('\r')[:64]

    # Content-Type
    cont_type = re.search('Content-Type: (.*)', data)
    if cont_type is not None:
        cont_type = cont_type.group(1).rstrip('\r')[:128]

    # Use Autocommit mode for database connection
    conn = util.connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            # Database statement
            cursor.execute(
                """
                INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host,
                referer,server_application,content_type,dst_port,corrupt,file_size,file_type)
                VALUES
                (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (sha1, md5, timestamp, srcip, dstip, method, url, host,
                 referer, server, cont_type, dst_port, corrupt_pe, file_size,
                 file_type))
            cursor.execute(
                """
                SELECT dump_id FROM pe_dumps where sha1 = %s
                    ORDER BY dump_id DESC LIMIT 1
                """,
                (sha1, ))
            dump_id = cursor.fetchone()[0]
        finally:
            cursor.close()
    finally:
        conn.close()

    print("A new entry on host:%s has been made in pe_dumps table with "
          "dump_id %s" % (host, dump_id))
    return dump_id, corrupt_pe, host, dstip, srcip