def classify_dump(dump_id):
    print_arff(dump_id)
    subprocess.call(
        "java -Xmx2000m -cp ./weka.jar "
        "weka.classifiers.meta.FilteredClassifier "
        "-l %s -p 1,58,59 -distribution -T test.arff "
        "> test.result" % (model_file,),
        shell=True)
    conn = util.connect_to_db()
    cursor = conn.cursor()
    score = None
    with open('test.result', 'r') as f:
        for line in f:
            if ':' in line:
                for word in line.split():
                    if '*' in word:
                        score = word.split(',')[0]
                        if score.startswith('*'):
                            score = score[1:]
    print "AMICO Score:", score
    cursor.execute("""
        DELETE FROM amico_scores
        WHERE dump_id = %s""",
        (dump_id, ))
    cursor.execute("INSERT INTO amico_scores VALUES (%s, %s)",
                   (dump_id, score))
    subprocess.call("rm test.arff", shell=True)
    subprocess.call("rm test.result", shell=True)
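# The loop above assumes the prediction format Weka emits for
# "-p ... -distribution", where the distribution column marks the predicted
# class with '*' (e.g. "*0.93,0.07"). A minimal standalone sketch of the same
# extraction, tested against a made-up output line:
def parse_weka_score(result_line):
    for word in result_line.split():
        if '*' in word:
            return word.split(',')[0].lstrip('*')
    return None

assert parse_weka_score("1  1:?  1:pos  *0.93,0.07") == "0.93"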
def db_syslog(dump_id):
    time.sleep(WAIT_TIME)
    conn = util.connect_to_db()
    cursor = conn.cursor()
    make_syslog_entry(cursor, dump_id)
    cursor.close()
    conn.close()
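# make_syslog_entry() is defined elsewhere; a minimal sketch of what it might
# do, assuming the standard "syslog" module and the pe_dumps/amico_scores
# tables used elsewhere in this code (the message format is hypothetical):
import syslog

def make_syslog_entry(cursor, dump_id):
    cursor.execute("""
        SELECT pe.sha1, ams.score
        FROM pe_dumps AS pe JOIN amico_scores AS ams
            ON pe.dump_id = ams.dump_id
        WHERE pe.dump_id = %s""", (dump_id, ))
    row = cursor.fetchone()
    if row:
        syslog.syslog(syslog.LOG_ALERT,
                      "AMICO dump_id:%s sha1:%s score:%s" %
                      (dump_id, row[0], row[1]))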
def main():
    key_bytes = [random.randint(0, 255) for _ in range(3)]
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""
        DROP TABLE IF EXISTS pe_dumps_copy """)
    cursor.execute("""
        CREATE TABLE pe_dumps_copy AS TABLE pe_dumps """)
    cursor.execute("""
        SELECT DISTINCT client FROM pe_dumps_copy """)
    orig_clients = [row[0] for row in cursor.fetchall()]
    anony_clients = {}
    num_ips = len(orig_clients)
    for ip in orig_clients:
        anony_clients[ip] = anonymize_ip(ip, key_bytes)
    past_progress = 0
    for i, ip in enumerate(anony_clients):
        progress = round((float(i) / num_ips), 2)
        if progress > past_progress:
            drawProgressBar(progress)
            past_progress = progress
        cursor.execute("""
            UPDATE pe_dumps_copy
            SET client = %s
            WHERE client = %s """,
            (anony_clients[ip], ip))
    print "\nMade a copy of pe_dumps table with anonymized client IPs!!"
    cursor.close()
    conn.close()
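# anonymize_ip() is defined elsewhere; a plausible sketch consistent with the
# three random key bytes generated above (this exact scheme is an assumption):
# keep the first octet and XOR the remaining three with the key, so the
# mapping is consistent across rows but not recoverable without the key.
def anonymize_ip(ip, key_bytes):
    octets = [int(o) for o in ip.split('.')]
    anonymized = octets[:1] + [o ^ k for o, k in zip(octets[1:], key_bytes)]
    return '.'.join(str(o) for o in anonymized)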
def __init__(self):
    self.output_file = "train.arff"
    self.conn = util.connect_to_db()
    self.conn.set_isolation_level(
        psycopg2.extensions.ISOLATION_LEVEL_READ_COMMITTED)
    self.clean_label_delta = timedelta(days=30)
    self.training_end_date = date.today()
    if training_start_date:
        self.training_start_date = datetime.strptime(
            training_start_date, "%Y-%m-%d")
    else:
        cursor = self.conn.cursor()
        cursor.execute("""
            SELECT MIN(timestamp) FROM pe_dumps""")
        if cursor.rowcount > 0:
            self.training_start_date = cursor.fetchone()[0].date()
        else:
            print "No entries in the database to train!"
            sys.exit()
        cursor.close()
    if training_days:
        self.training_end_date = (self.training_start_date +
                                  timedelta(days=training_days))
    print "Training start date:", self.training_start_date.strftime("%B %d, %Y")
    print "Training end date:", self.training_end_date.strftime("%B %d, %Y")
def update_score(dump_id, score):
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""
        DELETE FROM amico_scores
        WHERE dump_id = %s""",
        (dump_id, ))
    cursor.execute("INSERT INTO amico_scores VALUES (%s, %s)",
                   (dump_id, score))
    # release the connection; the writes rely on the autocommit connection
    # returned by util.connect_to_db()
    cursor.close()
    conn.close()
def main():
    conn = util.connect_to_db()
    logging.basicConfig(level=logging.DEBUG, filename=LOG_FILE, filemode='w')
    raw_file_names = os.listdir(RAW_FILE_DIR)
    for fn in raw_file_names:
        file_path = os.path.join(RAW_FILE_DIR, fn)
        print "Analyzing file:", file_path
        update_url(file_path, conn)
def print_arff(dump_id):
    conn = util.connect_to_db()
    cursor = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
    cursor.execute("""
        SELECT * FROM weka_features
        WHERE dump_id = %s""",
        (dump_id, ))
    if cursor.rowcount == 0:
        print "Feature vector not found. Exiting..."
        return
    res = cursor.fetchone()
    res = res._asdict()
    del res['raw_dump_num_av_labels']
    del res['raw_dump_trusted_av_labels']
    w = open(output_file, 'w')
    w.write('@RELATION test\n\n')
    values = []
    for feature in features:
        if feature in ['sha1', 'dump_id', 'host', 'corrupt',
                       'vt_month_shelf', 'url_struct']:
            data_type = "STRING"
        elif feature == "extension_class":
            data_type = ("{common_ext,unknown_ext,common_fake,other_ext,"
                         "no_url,no_ext}")
        else:
            data_type = "NUMERIC"
        w.write('@ATTRIBUTE %s %s\n' % (feature, data_type))
        values.append(res[feature])
        # print "%s : %s" % (feature, res[feature])
    w.write('@ATTRIBUTE class {pos, neg}\n\n')
    w.write('@DATA\n\n')
    try:
        # '?' is the ARFF missing-value marker for NULL/empty fields
        # (note: comparison must be ==, not the identity test "is")
        data_string = ','.join(['?' if (value is None or value == '')
                                else str(value) for value in values])
    except Exception as e:
        print "Error in writing feature vector to file!", e
    else:
        data_string += ",?"
        w.write(data_string + '\n')
    w.close()
    cursor.close()
    conn.close()
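# For reference, the ARFF file written above takes this shape (attribute
# order follows the module-level `features` list; values abridged):
#
#   @RELATION test
#
#   @ATTRIBUTE dump_id STRING
#   @ATTRIBUTE host_malware_downloads NUMERIC
#   ...
#   @ATTRIBUTE class {pos, neg}
#
#   @DATA
#
#   4242,3,...,?
#
# NULL and empty fields become '?', and the trailing ",?" is the unknown
# class label that the classifier is asked to predict.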
def manual_download(captured_sha1):
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()
    # Database query to get the relevant recent record
    cursor.execute("""
        SELECT dump_id, host, url, referer, client, server
        FROM pe_dumps
        WHERE sha1 = %s
        ORDER BY timestamp DESC;""",
        (captured_sha1, ))
    row = cursor.fetchone()
    dump_id = row[0]
    host = row[1]
    url = row[2]
    referer = row[3]
    client = row[4]
    server = row[5]
    full_url = "http://"
    ordered_host = server  # if host is null, we use the server IP
    if host:
        ordered_host = util.reorder_domain(host)
    full_url += ordered_host
    if url:
        full_url += url
    print "Starting manual download from :", full_url
    # Prepare the urllib2 request
    req = urllib2.Request(full_url)
    req.add_header("User-Agent", USER_AGENT)
    download_time = time.time()
    sha1, md5, different, is_interesting_file = download_file(
        dump_id, req, captured_sha1)
    # Database statement
    cursor.execute("""
        INSERT INTO manual_download_checksums(dump_id, sha1, md5, different,
            referer_exists, timestamp, is_pe)
        VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
        (dump_id, sha1, md5, different, False, download_time,
         is_interesting_file))
    cursor.close()
    conn.close()
def build(window):
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""SELECT max(dump_id) from pe_dumps""")
    lastID = cursor.fetchone()[0]
    minID = lastID - window
    currID = lastID - window
    while currID < lastID:
        cursor.execute("""SELECT file_type FROM pe_dumps
                          WHERE dump_id=%d""" % (currID, ))
        row = cursor.fetchone()
        if row is not None:  # dump_ids may have gaps
            file_extension = row[0]
            get_feature_vector(currID, file_extension, minID)
        currID += 1
def build(window):
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""SELECT max(dump_id) from pe_dumps""")
    lastID = cursor.fetchone()[0]
    minID = lastID - window
    currID = lastID - window
    while currID < lastID:
        # This file_type query is left over from the older variant below;
        # its result is no longer used.
        cursor.execute("""SELECT file_type FROM pe_dumps
                          WHERE dump_id=%d""" % (currID,))
        # file_extension = cursor.fetchone()[0]
        # get_feature_vector(currID, file_extension, minID)
        get_feature_vector(currID)
        currID += 1
def __init__(self):
    self.QUERY_RATE_LIMIT = 10
    self.ONE_MIN = 60
    logging.config.fileConfig(LOG_CONF_FILE)
    self.logger = logging.getLogger("amico_logger")
    # stdout_handler = logging.StreamHandler(sys.stdout)
    # stdout_handler.setLevel(logging.DEBUG)
    # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s'
    #                               '- %(message)s')
    # stdout_handler.setFormatter(formatter)
    # self.logger.addHandler(stdout_handler)
    util.setup_socks()
    self.conn = util.connect_to_db()
    self.cursor = self.conn.cursor()
    self.today = date.today().strftime("%Y-%m-%d")
    self.yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
    self.last_month = (date.today() - timedelta(days=30)).strftime("%Y-%m-%d")
def manual_download(captured_sha1):
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()
    # Database query to get the relevant recent record
    cursor.execute("""
        SELECT dump_id, host, url, referer, client, server
        FROM pe_dumps
        WHERE sha1 = %s
        ORDER BY timestamp DESC;""",
        (captured_sha1,))
    row = cursor.fetchone()
    dump_id = row[0]
    host = row[1]
    url = row[2]
    referer = row[3]
    client = row[4]
    server = row[5]
    if host is None:
        host = server
    ordered_host = util.reorder_domain(host)
    full_url = "http://" + ordered_host + url
    # print full_url
    # Prepare the urllib2 request
    req = urllib2.Request(full_url)
    req.add_header("User-Agent", USER_AGENT)
    download_time = time.time()
    sha1, md5, different, is_pe = download_file(dump_id, req, captured_sha1)
    # Database statement
    cursor.execute("""
        INSERT INTO manual_download_checksums(dump_id, sha1, md5, different,
            referer_exists, timestamp, is_pe)
        VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
        (dump_id, sha1, md5, different, False, download_time, is_pe))
    cursor.close()
    conn.close()
def get_feature_vector(dump_id):
    # print "entered get_feature_vector"
    conn = util.connect_to_db()
    cursor = conn.cursor()
    insert_features(cursor, dump_id)
    print "Done inserting features for dump_id: ", dump_id
    (%(document_id)s, %(paragraph_id)s, %(theorem_type)s, %(text)s)
"""
authorshipInsertStmt = """
    INSERT INTO authorship(document, rank, display_name, zbmath_id)
    VALUES (%(document_id)s, %(rank)s, %(display_name)s, %(zbmath_id)s)
"""
mscAssignmentInsertStmt = """
    INSERT INTO msc_assignment(document, msc, pos)
    VALUES (%(document_id)s, %(msc)s, %(pos)s)
"""
db = connect_to_db()
cursor = db.cursor()
warning_log = open("warning_log", "a")
p = DocumentParser()
# filepath = "raw_data/test_documents/07040005.xml"
# for filename in filesInDict("raw_data/test_documents", True):
for filename, filepath in zip(filenames, filepaths):
    sys.stdout.write("processing " + filename + "... ")
    # doc, tokenizedParagraphs, formulaDict = p.parseWithParagraphStructure(filename)
    doc, raw_paragraphs, formula_dict = p.parse_raw(filepath)
    # info for doc table:
    document_id = doc.arxiv_id()
    publication_date = doc.publication_date
def generate_CSV_download_file():
    connection = util.connect_to_db()
    connection_cursor = connection.cursor()
    csv_writer = None
    header = ("Second_ID,Mal_APK,Tot_APK,Mal_DMG,Tot_DMG,Mal_ELF,Tot_ELF,"
              "Mal_EXE,Tot_EXE,Mal_PDF,Tot_PDF,Mal_SWF,Tot_SWF,Mal_JAR,"
              "Tot_JAR,Mal_RAR,Tot_RAR,Mal_ZIP,Tot_ZIP,Timestamp,"
              "Next_Download_Event_[s]")
    header_list = header.split(',')
    created_csv_file = OUT_DIR + "/" + str(DOWNLOAD_GRAPH_ID) + "-downloads_" + \
        datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".csv"
    with open(created_csv_file, "wb") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header_list)
    csv_map = defaultdict(list)

    # One pass per file type. Each type contributes a (malware, total) column
    # pair, so a timestamp first seen at type index k is padded with 2*k
    # zeros for the earlier types. (The original unrolled this loop once per
    # type; note the space before GROUP BY, which the concatenated queries
    # were missing.)
    file_types = ['APK', 'DMG', 'ELF', 'EXE', 'PDF', 'SWF', 'JAR', 'RAR', 'ZIP']
    for index, file_type in enumerate(file_types):
        malware_timestamp_set = set()
        padding = [0] * (2 * index)
        query = ("SELECT timestamp, COUNT(pe.file_type) "
                 "FROM pe_dumps AS pe, amico_scores AS ams "
                 "WHERE pe.dump_id = ams.dump_id "
                 "AND pe.file_type = '" + file_type + "' "
                 "AND ams.score > " + str(AMICO_THRESHOLD) + " "
                 "GROUP BY timestamp ORDER BY timestamp ASC")
        connection_cursor.execute(query)
        for row in connection_cursor:
            if row is not None:
                timestamp = str(row[0])
                malware_count_per_second = row[1]
                if timestamp not in csv_map:
                    csv_map[timestamp].extend(padding)
                csv_map[timestamp].append(malware_count_per_second)
                malware_timestamp_set.add(timestamp)
        query = ("SELECT timestamp, COUNT(file_type) FROM pe_dumps "
                 "WHERE file_type = '" + file_type + "' "
                 "GROUP BY timestamp ORDER BY timestamp ASC")
        connection_cursor.execute(query)
        for row in connection_cursor:
            if row is not None:
                timestamp = str(row[0])
                total_count_per_second = row[1]
                if timestamp not in csv_map:
                    csv_map[timestamp].extend(padding)
                if timestamp in malware_timestamp_set:
                    csv_map[timestamp].append(total_count_per_second)
                else:
                    csv_map[timestamp].extend([0, total_count_per_second])

    sorted_csv_map = sorted(csv_map.items(), key=operator.itemgetter(0))
    csv_map_aux = defaultdict(list)
    first_useful_date = "2014-11-26 22:55:40"
    last_useful_date = "2015-10-01 00:00:00"
    # Loop for handling corrupted timestamps: entries sorting before the
    # first useful date keep their time of day but move onto that date.
    for timestamp, file_list in sorted_csv_map:
        if cmp(timestamp, first_useful_date) < 0:
            timestamp_hms = timestamp.split()[1]
            first_useful_date_ymd = first_useful_date.split()[0]
            corrected_timestamp = first_useful_date_ymd + " " + timestamp_hms
            csv_map_aux[corrected_timestamp] = csv_map.get(timestamp)
        else:
            break
    max_values = len(header.split(',')) - 2
    csv_rows = list()
    sorted_csv_map_aux = sorted(csv_map_aux.items(), key=operator.itemgetter(0))
    UID = 0
    for timestamp, file_list in sorted_csv_map_aux:
        formatted_row = format_row(file_list, max_values)
        formatted_row.insert(0, UID)
        formatted_row.append(timestamp)
        csv_rows.append(formatted_row)
        UID += 1
    writable_csv_rows = list()
    while csv_rows:
        current_row = csv_rows.pop(0)
        if not csv_rows:
            writable_csv_rows.append(current_row)
            continue
        next_row = csv_rows[0]
        timestamp_index = len(current_row) - 1
        current_timestamp_string = current_row[timestamp_index]
        next_timestamp_string = next_row[timestamp_index]
        current_timestamp = datetime.strptime(current_timestamp_string,
                                              '%Y-%m-%d %H:%M:%S')
        next_timestamp = datetime.strptime(next_timestamp_string,
                                           '%Y-%m-%d %H:%M:%S')
        time_delta_in_secs = int(
            (next_timestamp - current_timestamp).total_seconds()) - 1
        current_row.append(time_delta_in_secs)
        writable_csv_rows.append(current_row)
    writable_sorted_csv_map = list()
    for timestamp, file_list in sorted_csv_map:
        if (cmp(timestamp, first_useful_date) < 0 or
                cmp(timestamp, last_useful_date) > 0):
            continue
        writable_sorted_csv_map.append([timestamp, file_list])
    writable_csv_rows_aux = list()
    while writable_sorted_csv_map:
        timestamp_str_first_pair, file_list_first_pair = \
            writable_sorted_csv_map.pop(0)
        if not writable_sorted_csv_map:
            formatted_row = format_row(file_list_first_pair, max_values)
            formatted_row.insert(0, UID)
            formatted_row.append(timestamp_str_first_pair)
            writable_csv_rows_aux.append(formatted_row)
            UID += 1
            continue
        timestamp_str_second_pair = writable_sorted_csv_map[0][0]
        formatted_row = format_row(file_list_first_pair, max_values)
        formatted_row.insert(0, UID)
        formatted_row.append(timestamp_str_first_pair)
        timestamp_first_pair = datetime.strptime(timestamp_str_first_pair,
                                                 '%Y-%m-%d %H:%M:%S')
        timestamp_second_pair = datetime.strptime(timestamp_str_second_pair,
                                                  '%Y-%m-%d %H:%M:%S')
        time_delta_in_secs = int(
            (timestamp_second_pair - timestamp_first_pair).total_seconds()) - 1
        formatted_row.append(time_delta_in_secs)
        writable_csv_rows_aux.append(formatted_row)
        UID += 1
    # Stitch the corrected-timestamp rows to the in-range rows: the last
    # corrected row takes its delta from the first in-range row.
    last_formatted_row_in_writable_csv_rows = writable_csv_rows.pop(
        len(writable_csv_rows) - 1)
    first_formatted_row_in_writable_csv_rows_aux = writable_csv_rows_aux[0]
    timestamp_index = len(last_formatted_row_in_writable_csv_rows) - 1
    current_timestamp_string = last_formatted_row_in_writable_csv_rows[
        timestamp_index]
    next_timestamp_string = first_formatted_row_in_writable_csv_rows_aux[
        timestamp_index]
    current_timestamp = datetime.strptime(current_timestamp_string,
                                          '%Y-%m-%d %H:%M:%S')
    next_timestamp = datetime.strptime(next_timestamp_string,
                                       '%Y-%m-%d %H:%M:%S')
    time_delta_in_secs = int(
        (next_timestamp - current_timestamp).total_seconds()) - 1
    last_formatted_row_in_writable_csv_rows.append(time_delta_in_secs)
    writable_csv_rows_aux.insert(0, last_formatted_row_in_writable_csv_rows)
    with open(created_csv_file, "a") as csv_file:
        # quoting must be passed as a keyword argument; passed positionally
        # it would be misread as the csv dialect
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
        for row in writable_csv_rows:
            csv_writer.writerow(row)
        for row in writable_csv_rows_aux:
            csv_writer.writerow(row)
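# format_row() is not shown here; from its use above it must pad a
# per-timestamp count list out to max_values entries. A minimal sketch under
# that assumption:
def format_row(file_list, max_values):
    row = list(file_list)
    row.extend([0] * (max_values - len(row)))
    return row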
def fe_db_setup():
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""
        DROP table if exists features""")
    cursor.execute("""
        DROP table if exists weka_features""")
    cursor.execute("""
        CREATE TABLE weka_features(
            dump_id INT,
            raw_dump_num_av_labels INT,
            raw_dump_trusted_av_labels INT,
            vt_month_shelf BOOLEAN,
            corrupt BOOLEAN,
            host_malware_downloads INT,
            host_suspicious_downloads INT,
            host_benign_downloads INT,
            host_total_downloads INT,
            host_malware_ratio REAL,
            host_suspicious_ratio REAL,
            host_benign_ratio REAL,
            host_avg_av_labels REAL,
            host_avg_trusted_labels REAL,
            host_unknown_hashes INT,
            host_total_hashes INT,
            host_unknown_hash_ratio REAL,
            twold_malware_downloads INT,
            twold_suspicious_downloads INT,
            twold_benign_downloads INT,
            twold_total_downloads INT,
            twold_malware_ratio REAL,
            twold_suspicious_ratio REAL,
            twold_benign_ratio REAL,
            twold_avg_av_labels REAL,
            twold_avg_trusted_labels REAL,
            twold_unknown_hashes INT,
            twold_total_hashes INT,
            twold_unknown_hash_ratio REAL,
            server_ip_malware_downloads INT,
            server_ip_suspicious_downloads INT,
            server_ip_benign_downloads INT,
            server_ip_total_downloads INT,
            server_ip_malware_ratio REAL,
            server_ip_suspicious_ratio REAL,
            server_ip_benign_ratio REAL,
            server_ip_avg_av_labels REAL,
            server_ip_avg_trusted_labels REAL,
            server_ip_unknown_hashes INT,
            server_ip_total_hashes INT,
            server_ip_unknown_hash_ratio REAL,
            bgp_malware_downloads INT,
            bgp_suspicious_downloads INT,
            bgp_benign_downloads INT,
            bgp_total_downloads INT,
            bgp_malware_ratio REAL,
            bgp_suspicious_ratio REAL,
            bgp_benign_ratio REAL,
            bgp_avg_av_labels REAL,
            bgp_avg_trusted_labels REAL,
            bgp_unknown_hashes INT,
            bgp_total_hashes INT,
            bgp_unknown_hash_ratio REAL,
            hash_life_time INT,
            num_dumps_with_same_hash INT,
            hash_daily_dump_rate_per_client REAL,
            estimated_clients_with_same_hash INT,
            referer_exists INT,
            host_name_exists INT,
            extension_class VARCHAR(20),
            url_length INT,
            directory_depth INT,
            sha1 VARCHAR(40),
            host VARCHAR(256),
            url_malware_downloads INT,
            url_total_downloads INT,
            url_distinct_sha1s INT,
            url_struct VARCHAR(512),
            url_struct_malware_downloads INT,
            url_struct_total_downloads INT,
            url_struct_distinct_sha1s INT)
        """)
    print "Created weka_features table!"
    conn.commit()
    cursor.close()
    conn.close()
__author__ = 'vincenzo'

import util
import db_extraction
import pickle
import dill
import re
import urlparse
import numpy as np

for i in range(11):
    sketch = pickle.load(
        open("./%d/url_struct_total_downloads-%d.p" % (i, i), 'rb'))
    db = db_extraction.DBextraction()
    conn = util.connect_to_db()
    cursor = conn.cursor()
    inner_cursor = conn.cursor()
    start_id = db.maxID - 10000
    cursor.execute("""SELECT distinct url FROM pe_dumps AS pe
                      WHERE pe.dump_id > %d and url is not null """
                   % (start_id,))
    for row in cursor:
        if row is not None:
            url = row[0]
def db_virus_total(dump_id):
    logging.config.fileConfig(LOG_CONF_FILE)
    logger = logging.getLogger("amico_logger")
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()
    # Exit if this sha1 has been queried in the past VT_QUERY_INTERVAL period
    prev_query_time = datetime(MINYEAR, 1, 1, 0, 0, 0, 0)
    time_now = datetime.now()
    cursor.execute("""
        SELECT sha1, md5 FROM pe_dumps
        WHERE dump_id = %s""",
        (dump_id, ))
    (sha1, md5) = cursor.fetchone()
    try:
        cursor.execute("SELECT query_time, vt_id FROM virus_total_scans "
                       "WHERE sha1 = %s "
                       "ORDER by query_time DESC",
                       (sha1, ))
        res = cursor.fetchone()
        if res:
            prev_query_time = res[0]
            vt_id = res[1]
    except:
        print "sha1:%s no previous VT query" % (sha1, )
    vt_query_period = timedelta(days=VT_QUERY_INTERVAL)
    if (time_now - prev_query_time) < vt_query_period:
        print "sha1:%s has been queried recently. Skipping..." % (sha1, )
        cursor.execute("""
            INSERT INTO ped_vts_mapping (dump_id, vt_id)
            VALUES (%s, %s)""",
            (dump_id, vt_id))
        conn.close()
        return
    tries = 0
    success = False
    while tries < MAX_TRIES:
        try:
            tries += 1
            json = vt_api.get_vt_report(md5)
            if not json:
                continue
            report = simplejson.loads(json)
            if report["response_code"] == 1:
                insert_report(cursor, report, sha1, md5, json, dump_id)
                success = True
                break
            elif report["response_code"] == 0:
                cursor.execute("""
                    INSERT INTO virus_total_scans(sha1, md5, query_time)
                    VALUES (%s, %s, CLOCK_TIMESTAMP())
                    RETURNING vt_id """,
                    (sha1, md5))
                vt_id = cursor.fetchone()[0]
                cursor.execute("""
                    INSERT INTO ped_vts_mapping (dump_id, vt_id)
                    VALUES (%s, %s)""",
                    (dump_id, vt_id))
                print "Virus Total: No scan report exists in the VT database"
                success = True
                break
            else:
                logger.exception("Unknown response code! %s" %
                                 (report["response_code"], ))
                time.sleep(1)
        except Exception as e:
            print e
            logger.exception("Try %s. Error in fetching report for md5 %s: %s"
                             % (tries, md5, e))
            time.sleep(5)
    if not success:
        cursor.execute("""
            INSERT INTO ped_vts_mapping (dump_id)
            VALUES (%s)""",
            (dump_id, ))
        logger.warning("Giving up on dump_id: %s's VT report" % (dump_id, ))
    cursor.close()
    conn.close()
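# vt_api.get_vt_report() is imported from elsewhere; a minimal sketch against
# VirusTotal's v2 file/report endpoint. The API key constant is a placeholder,
# and error handling is left to the caller's retry loop above.
import urllib
import urllib2

VT_API_KEY = "YOUR_VT_API_KEY"  # placeholder

def get_vt_report(checksum):
    url = "https://www.virustotal.com/vtapi/v2/file/report"
    params = urllib.urlencode({"resource": checksum, "apikey": VT_API_KEY})
    # urlopen with a data argument issues a POST; returns the raw JSON body
    return urllib2.urlopen(url, params).read()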
def db_file_dumps(file_path, sha1, md5, file_size, file_type):
    # print "Time b4 http parsing: %f" % (time.time(),)
    # Use Autocommit mode for database connection
    conn = util.connect_to_db()
    cursor = conn.cursor()
    fileHandle = open(file_path)

    # Timestamp
    r = re.compile('[0-9]+')
    timestamp = r.search(fileHandle.readline())
    if timestamp is not None:
        timestamp = timestamp.group()

    # Source and Destination IPs
    r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*')
    ip = r.search(fileHandle.readline())
    if ip is not None:
        srcip = ip.group(2)
        dstip = ip.group(1)
        dst_port = ip.group(3)
    else:
        srcip = None
        dstip = None
        dst_port = None

    # URL
    r = re.compile('(GET|POST|HEAD) (.*) ')
    url = r.search(fileHandle.readline())
    if url is not None:
        method = url.group(1)
        method = method[:10]
        url = url.group(2)
    else:
        method = None

    # Host
    r = re.compile('Host: (.*)')
    host = r.search(fileHandle.readline())
    if host is not None:
        host = host.group(1)
        host = util.reorder_domain(host.strip())

    # Referer
    r = re.compile('Referer: (.*)')
    referer = r.search(fileHandle.readline())
    if referer is not None:
        referer = referer.group(1)

    # CORRUPT_FILE marker
    corrupt_pe = False
    r = re.compile('CORRUPT_FILE')
    corrupt_pe_str = r.search(fileHandle.readline())
    if corrupt_pe_str is not None:
        corrupt_pe = True

    # Now, parse data from the response
    # Server
    data = fileHandle.read()
    r = re.compile('Server: (.*)')
    server = r.search(data)
    if server is not None:
        server = server.group(1)
        server = server.rstrip('\r')
        server = server[:64]

    # Content-Type
    r = re.compile('Content-Type: (.*)')
    cont_type = r.search(data)
    if cont_type is not None:
        cont_type = cont_type.group(1)
        cont_type = cont_type.rstrip('\r')
        cont_type = cont_type[:128]

    # Database statement
    cursor.execute("""
        INSERT INTO pe_dumps(sha1, md5, timestamp, server, client, method,
            url, host, referer, server_application, content_type, dst_port,
            corrupt, file_size, file_type)
        VALUES (%s, %s, TO_TIMESTAMP(%s), %s, %s, %s, %s, %s, %s, %s, %s,
            %s, %s, %s, %s)""",
        (sha1, md5, timestamp, srcip, dstip, method, url, host, referer,
         server, cont_type, dst_port, corrupt_pe, file_size, file_type))
    cursor.execute("""
        SELECT dump_id FROM pe_dumps
        WHERE sha1 = %s
        ORDER BY dump_id DESC LIMIT 1 """,
        (sha1, ))
    dump_id = cursor.fetchone()[0]
    print("A new entry on host:%s has been made in pe_dumps table with "
          "dump_id %s" % (host, dump_id))
    fileHandle.close()
    cursor.close()
    conn.close()
    return dump_id, corrupt_pe, host, dstip, srcip
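# The readline()/regex sequence above implies each dump file begins with a
# small plain-text header, one field per line, ahead of the raw HTTP response
# headers. An illustrative (made-up) example in the order parsed:
#
#   1415113200
#   10.0.0.7:51234-93.184.216.34:80-...
#   GET /files/update.exe HTTP/1.1
#   Host: downloads.example.com
#   Referer: http://example.com/page
#   CORRUPT_FILE      (only for truncated downloads; otherwise the line is
#                      assumed blank)
#   HTTP/1.1 200 OK
#   Server: nginx
#   Content-Type: application/octet-stream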
__author__ = 'vincenzo'

from build_weka_database import build
from classify import classify_dump
import util

build(10000)
conn = util.connect_to_db()
cursor = conn.cursor()
cursor.execute("""SELECT max(dump_id) from pe_dumps""")
lastID = cursor.fetchone()[0]
currID = lastID - 10000
while currID < lastID:
    classify_dump(currID)
    currID += 1
def db_pe_dumps(file_path, sha1, md5, file_size):
    # print "Time b4 http parsing: %f" % (time.time(),)
    # Use Autocommit mode for database connection
    conn = util.connect_to_db()
    cursor = conn.cursor()
    fileHandle = open(file_path)

    # Timestamp
    r = re.compile("[0-9]+")
    timestamp = r.search(fileHandle.readline())
    if timestamp is not None:
        timestamp = timestamp.group()

    # Source and Destination IPs
    r = re.compile("([0-9.]+):.*-([0-9.]+):([0-9]+)-.*")
    ip = r.search(fileHandle.readline())
    if ip is not None:
        srcip = ip.group(2)
        dstip = ip.group(1)
        dst_port = ip.group(3)
    else:
        srcip = None
        dstip = None
        dst_port = None

    # URL
    r = re.compile("(GET|POST|HEAD) (.*) ")
    url = r.search(fileHandle.readline())
    if url is not None:
        method = url.group(1)
        method = method[:10]
        url = url.group(2)
    else:
        method = None

    # Host
    r = re.compile("Host: (.*)")
    host = r.search(fileHandle.readline())
    if host is not None:
        host = host.group(1)
        host = util.reorder_domain(host.strip())

    # Referer
    r = re.compile("Referer: (.*)")
    referer = r.search(fileHandle.readline())
    if referer is not None:
        referer = referer.group(1)

    # CORRUPT_PE / CORRUPT_FILE marker
    corrupt_pe = False
    r = re.compile("CORRUPT_(PE|FILE)")
    corrupt_pe_str = r.search(fileHandle.readline())
    if corrupt_pe_str is not None:
        corrupt_pe = True

    # Now, parse data from the response
    # Server
    data = fileHandle.read()
    r = re.compile("Server: (.*)")
    server = r.search(data)
    if server is not None:
        server = server.group(1)
        server = server.rstrip("\r")
        server = server[:64]

    # Content-Type
    r = re.compile("Content-Type: (.*)")
    cont_type = r.search(data)
    if cont_type is not None:
        cont_type = cont_type.group(1)
        cont_type = cont_type.rstrip("\r")
        cont_type = cont_type[:128]

    # Database statement
    cursor.execute("""
        INSERT INTO pe_dumps(sha1, md5, timestamp, server, client, method,
            url, host, referer, server_application, content_type, dst_port,
            corrupt, file_size)
        VALUES (%s, %s, TO_TIMESTAMP(%s), %s, %s, %s, %s, %s, %s, %s, %s,
            %s, %s, %s)""",
        (sha1, md5, timestamp, srcip, dstip, method, url, host, referer,
         server, cont_type, dst_port, corrupt_pe, file_size))
    cursor.execute("""
        SELECT dump_id FROM pe_dumps
        WHERE sha1 = %s
        ORDER BY dump_id DESC """,
        (sha1,))
    dump_id = cursor.fetchone()[0]
    print("A new entry on host:%s has been made in pe_dumps table with "
          "dump_id %s" % (host, dump_id))
    fileHandle.close()
    cursor.close()
    conn.close()
    return dump_id, corrupt_pe
def generate_JSON_map_file():
    connection = util.connect_to_db()
    dictionary_index = 0
    monitoring_server_ip = "127.0.0.1"
    server_host_mapping = defaultdict(set)
    total_json_map = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    malware_json_map = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))
    created_json_file = OUT_DIR + "/" + str(MAP_GRAPH_ID) + "-downloads_" + \
        datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".json"
    # One query batch per file type, in the same order as the CSV generator
    # (the original unrolled this loop once per type).
    for file_type in ["APK", "DMG", "ELF", "EXE", "PDF",
                      "SWF", "JAR", "RAR", "ZIP"]:
        server_host_mapping, total_json_map, malware_json_map = \
            perform_queries_on(connection, server_host_mapping,
                               total_json_map, malware_json_map,
                               file_type, dictionary_index)
        dictionary_index += 1
    UID = 0
    JSON_object = encode_data_as_JSON(UID, monitoring_server_ip,
                                      total_json_map, malware_json_map)
    with open(created_json_file, "wb") as json_file:
        json.dump(JSON_object, json_file)
def generate_CSV_download_file(): connection = util.connect_to_db() connection_cursor = connection.cursor() csv_writer = None header = "Second_ID,Mal_APK,Tot_APK,Mal_DMG,Tot_DMG,Mal_ELF,Tot_ELF,Mal_EXE,Tot_EXE,Mal_PDF,Tot_PDF,Mal_SWF," + \ "Tot_SWF,Mal_JAR,Tot_JAR,Mal_RAR,Tot_RAR,Mal_ZIP,Tot_ZIP,Timestamp,Next_Download_Event_[s]" header_list = ["Second_ID", "Mal_APK", "Tot_APK", "Mal_DMG", "Tot_DMG", "Mal_ELF", "Tot_ELF", "Mal_EXE", "Tot_EXE", "Mal_PDF", "Tot_PDF", "Mal_SWF", "Tot_SWF", "Mal_JAR", "Tot_JAR", "Mal_RAR", "Tot_RAR", "Mal_ZIP", "Tot_ZIP", "Timestamp", "Next_Download_Event_[s]"] created_csv_file = OUT_DIR + "/" + str(DOWNLOAD_GRAPH_ID) + "-downloads_" + \ datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".csv" with open(created_csv_file, "wb") as csv_file: csv_writer = csv.writer(csv_file) csv_writer.writerow(header_list) csv_map = defaultdict(list) malware_timestamp_set = set() ##################################################### EXECUTABLES ##################################################### query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'APK' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_apk_count_per_second = row[1] csv_map[timestamp].append(malware_apk_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'APK' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_apk_count_per_second = row[1] if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_apk_count_per_second) else: csv_map[timestamp].extend([0, total_apk_count_per_second]) malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'DMG' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_dmg_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0]) csv_map[timestamp].append(malware_dmg_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'DMG' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_dmg_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_dmg_count_per_second) else: csv_map[timestamp].extend([0, total_dmg_count_per_second]) malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'ELF' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_elf_count_per_second = row[1] if timestamp not in csv_map: 
csv_map[timestamp].extend([0, 0, 0, 0]) csv_map[timestamp].append(malware_elf_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'ELF' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_elf_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_elf_count_per_second) else: csv_map[timestamp].extend([0, total_elf_count_per_second]) malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'EXE' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_exe_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0]) csv_map[timestamp].append(malware_exe_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'EXE' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_exe_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_exe_count_per_second) else: csv_map[timestamp].extend([0, total_exe_count_per_second]) ######################################################################################################################## ######################################################### PDF ######################################################### malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'PDF' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_pdf_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0]) csv_map[timestamp].append(malware_pdf_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'PDF' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_pdf_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_pdf_count_per_second) else: csv_map[timestamp].extend([0, total_pdf_count_per_second]) ######################################################################################################################## ######################################################## FLASH ######################################################## malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id 
AND pe.file_type = 'SWF' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_swf_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) csv_map[timestamp].append(malware_swf_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'SWF' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_swf_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_swf_count_per_second) else: csv_map[timestamp].extend([0, total_swf_count_per_second]) ######################################################################################################################## ###################################################### COMPRESSED ###################################################### malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'JAR' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_jar_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) csv_map[timestamp].append(malware_jar_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'JAR' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_jar_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_jar_count_per_second) else: csv_map[timestamp].extend([0, total_jar_count_per_second]) malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'RAR' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_rar_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) csv_map[timestamp].append(malware_rar_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'RAR' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_rar_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_rar_count_per_second) else: csv_map[timestamp].extend([0, total_rar_count_per_second]) 
malware_timestamp_set = set() query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \ """pe.dump_id = ams.dump_id AND pe.file_type = 'ZIP' AND ams.score > """ + str(AMICO_THRESHOLD) + \ """GROUP BY timestamp ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) malware_zip_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) csv_map[timestamp].append(malware_zip_count_per_second) malware_timestamp_set.add(timestamp) query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'ZIP' GROUP BY timestamp """ + \ """ORDER BY timestamp ASC""" connection_cursor.execute(query) for row in connection_cursor: if row is not None: timestamp = str(row[0]) total_zip_count_per_second = row[1] if timestamp not in csv_map: csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) if malware_timestamp_set.__contains__(timestamp): csv_map[timestamp].append(total_zip_count_per_second) else: csv_map[timestamp].extend([0, total_zip_count_per_second]) ######################################################################################################################## sorted_csv_map = sorted(csv_map.items(), key=operator.itemgetter(0)) csv_map_aux = defaultdict(list) first_useful_date = "2014-11-26 22:55:40" last_useful_date = "2015-10-01 00:00:00" # Loop for handling corrupted timestamp for timestamp, file_list in sorted_csv_map: if cmp(timestamp, first_useful_date) < 0: timestamp_split = timestamp.split() first_useful_date_split = first_useful_date.split() timestamp_hms = timestamp_split[1] first_useful_date_ymd = first_useful_date_split[0] corrected_timestamp = first_useful_date_ymd + " " + timestamp_hms csv_map_aux[corrected_timestamp] = csv_map.get(timestamp) else: break max_values = len(header.split(',')) - 2 csv_rows = list() sorted_csv_map_aux = sorted(csv_map_aux.items(), key=operator.itemgetter(0)) UID = 0 for timestamp, file_list in sorted_csv_map_aux: formatted_row = format_row(file_list, max_values) formatted_row.insert(0, UID) formatted_row.append(timestamp) csv_rows.append(formatted_row) UID += 1 writable_csv_rows = list() while csv_rows: current_row = csv_rows.pop(0) if not csv_rows: writable_csv_rows.append(current_row) continue next_row = csv_rows[0] timestamp_index = len(current_row) - 1 current_timestamp_string = current_row[timestamp_index] next_timestamp_string = next_row[timestamp_index] current_timestamp = datetime.strptime(current_timestamp_string, '%Y-%m-%d %H:%M:%S') next_timestamp = datetime.strptime(next_timestamp_string, '%Y-%m-%d %H:%M:%S') time_delta_in_secs = int((next_timestamp - current_timestamp).total_seconds()) - 1 current_row.append(time_delta_in_secs) writable_csv_rows.append(current_row) writable_sorted_csv_map = list() for timestamp, file_list in sorted_csv_map: if cmp(timestamp, first_useful_date) < 0 or cmp(timestamp, last_useful_date) > 0: continue else: writable_sorted_csv_map.append([timestamp, file_list]) writable_csv_rows_aux = list() while writable_sorted_csv_map: timestamp_file_list_first_pair = writable_sorted_csv_map.pop(0) timestamp_str_first_pair = timestamp_file_list_first_pair[0] file_list_first_pair = timestamp_file_list_first_pair[1] if not writable_sorted_csv_map: formatted_row = format_row(file_list_first_pair, max_values) formatted_row.insert(0, UID) formatted_row.append(timestamp_str_first_pair) 
        writable_csv_rows_aux.append(formatted_row)
        UID += 1
        continue
    timestamp_file_list_second_pair = writable_sorted_csv_map[0]
    timestamp_str_second_pair = timestamp_file_list_second_pair[0]
    formatted_row = format_row(file_list_first_pair, max_values)
    formatted_row.insert(0, UID)
    formatted_row.append(timestamp_str_first_pair)
    timestamp_first_pair = datetime.strptime(timestamp_str_first_pair,
                                             '%Y-%m-%d %H:%M:%S')
    timestamp_second_pair = datetime.strptime(timestamp_str_second_pair,
                                              '%Y-%m-%d %H:%M:%S')
    time_delta_in_secs = int((timestamp_second_pair - timestamp_first_pair).total_seconds()) - 1
    formatted_row.append(time_delta_in_secs)
    writable_csv_rows_aux.append(formatted_row)
    UID += 1

# Stitch the two row lists: give the last remapped row its gap to the
# first in-window row, then prepend it to the in-window rows.
last_formatted_row_in_writable_csv_rows = writable_csv_rows.pop()
first_formatted_row_in_writable_csv_rows_aux = writable_csv_rows_aux[0]
timestamp_index = len(last_formatted_row_in_writable_csv_rows) - 1
current_timestamp_string = last_formatted_row_in_writable_csv_rows[timestamp_index]
next_timestamp_string = first_formatted_row_in_writable_csv_rows_aux[timestamp_index]
current_timestamp = datetime.strptime(current_timestamp_string,
                                      '%Y-%m-%d %H:%M:%S')
next_timestamp = datetime.strptime(next_timestamp_string,
                                   '%Y-%m-%d %H:%M:%S')
time_delta_in_secs = int((next_timestamp - current_timestamp).total_seconds()) - 1
last_formatted_row_in_writable_csv_rows.append(time_delta_in_secs)
writable_csv_rows_aux.insert(0, last_formatted_row_in_writable_csv_rows)

with open(created_csv_file, "a") as csv_file:
    # Quote non-numeric fields so the timestamp strings survive round-tripping.
    csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
    for row in writable_csv_rows:
        csv_writer.writerow(row)
    for row in writable_csv_rows_aux:
        csv_writer.writerow(row)
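# The timestamp handling above mixes two concerns: remapping corrupted
# timestamps onto the first useful day, and computing the idle gap in
# seconds to the next row. A minimal sketch of both as standalone helpers;
# the function names are assumptions for illustration only.
def remap_corrupted_timestamp(timestamp, first_useful_date):
    # Keep the time of day, but move the date onto the first useful day.
    return first_useful_date.split()[0] + " " + timestamp.split()[1]

def gap_in_seconds(current_ts, next_ts, fmt='%Y-%m-%d %H:%M:%S'):
    current = datetime.strptime(current_ts, fmt)
    following = datetime.strptime(next_ts, fmt)
    return int((following - current).total_seconds()) - 1

# For example, remap_corrupted_timestamp("1970-01-01 22:55:41",
# "2014-11-26 22:55:40") gives "2014-11-26 22:55:41", and
# gap_in_seconds("2014-11-26 22:55:40", "2014-11-26 22:55:45") gives 4.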
def fe_db_setup():
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""
        DROP TABLE IF EXISTS features""")
    cursor.execute("""
        DROP TABLE IF EXISTS weka_features""")
    cursor.execute("""
        CREATE TABLE weka_features(
            dump_id INT,
            raw_dump_num_av_labels INT,
            raw_dump_trusted_av_labels INT,
            vt_month_shelf BOOLEAN,
            corrupt BOOLEAN,
            host_malware_downloads INT, host_suspicious_downloads INT,
            host_benign_downloads INT, host_total_downloads INT,
            host_malware_ratio REAL, host_suspicious_ratio REAL,
            host_benign_ratio REAL, host_avg_av_labels REAL,
            host_avg_trusted_labels REAL, host_unknown_hashes INT,
            host_total_hashes INT, host_unknown_hash_ratio REAL,
            twold_malware_downloads INT, twold_suspicious_downloads INT,
            twold_benign_downloads INT, twold_total_downloads INT,
            twold_malware_ratio REAL, twold_suspicious_ratio REAL,
            twold_benign_ratio REAL, twold_avg_av_labels REAL,
            twold_avg_trusted_labels REAL, twold_unknown_hashes INT,
            twold_total_hashes INT, twold_unknown_hash_ratio REAL,
            server_ip_malware_downloads INT, server_ip_suspicious_downloads INT,
            server_ip_benign_downloads INT, server_ip_total_downloads INT,
            server_ip_malware_ratio REAL, server_ip_suspicious_ratio REAL,
            server_ip_benign_ratio REAL, server_ip_avg_av_labels REAL,
            server_ip_avg_trusted_labels REAL, server_ip_unknown_hashes INT,
            server_ip_total_hashes INT, server_ip_unknown_hash_ratio REAL,
            bgp_malware_downloads INT, bgp_suspicious_downloads INT,
            bgp_benign_downloads INT, bgp_total_downloads INT,
            bgp_malware_ratio REAL, bgp_suspicious_ratio REAL,
            bgp_benign_ratio REAL, bgp_avg_av_labels REAL,
            bgp_avg_trusted_labels REAL, bgp_unknown_hashes INT,
            bgp_total_hashes INT, bgp_unknown_hash_ratio REAL,
            hash_life_time INT, num_dumps_with_same_hash INT,
            hash_daily_dump_rate_per_client REAL,
            estimated_clients_with_same_hash INT,
            referer_exists INT, host_name_exists INT,
            extension_class VARCHAR(20), url_length INT, directory_depth INT,
            sha1 VARCHAR(40), host VARCHAR(256),
            url_malware_downloads INT, url_total_downloads INT,
            url_distinct_sha1s INT,
            url_struct VARCHAR(512), url_struct_malware_downloads INT,
            url_struct_total_downloads INT, url_struct_distinct_sha1s INT)
        """)
    print "Created weka_features table!"
    conn.commit()
    cursor.close()
    conn.close()
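# A minimal sketch of reading back from the table created above, using a
# parameterized psycopg2 query. The helper name count_high_risk_hosts and
# the threshold parameter are illustrative assumptions, not part of the
# original pipeline.
def count_high_risk_hosts(min_malware_downloads):
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute(
        "SELECT host, host_malware_downloads FROM weka_features "
        "WHERE host_malware_downloads >= %s "
        "ORDER BY host_malware_downloads DESC",
        (min_malware_downloads,))
    rows = cursor.fetchall()
    for host, downloads in rows:
        print host, downloads
    cursor.close()
    conn.close()
    return rows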