def _detection_bucket(detections):
    """Return the sub-directory name used for a given detection count."""
    if detections in (0, 1):
        return str(detections)
    if 1 < detections <= 5:
        return '5'
    if 5 < detections <= 10:
        return '10'
    # Anything above 10 (or an unexpected negative value) goes here.
    return 'more'


def update_samples_label(repo):
    """Label every stored sample via VirusTotal and sort a copy by label.

    For each row returned by ``db.select_sample_all()`` the part of the file
    name before the first '.' is sent to ``make_virus_total_request``.  The
    returned label is stored with ``db.update_sample_lable`` and the file is
    copied from ``repo`` into one of the sub-directories ``0``, ``1``, ``5``,
    ``10`` or ``more`` according to the numeric label.

    While the request answers ``'Forbidden'`` the function sleeps 120
    seconds and retries the same sample.

    :param repo: directory holding the samples; it is concatenated directly
        with the file name, so it must end with a path separator.
    :raises ValueError: if a label other than ``'Forbidden'`` is not numeric.
    """
    import os  # local import keeps this block self-contained

    db = DatabaseHandler()
    for item in db.select_sample_all():
        file_name = item[1]
        while True:
            lable = make_virus_total_request(file_name.split('.')[0])
            if lable == 'Forbidden':
                print('%s  ->  %s  : Forbidden' % (item[0], file_name))
                time.sleep(120)
                continue

            # The database key is the ASCII-folded file name.
            shash = unicodedata.normalize('NFKD', file_name).encode(
                'ascii', 'ignore')
            rowcount = db.update_sample_lable(shash, lable)
            print('%s  ->  %s  :  %s  RowCount :  %s'
                  % (item[0], file_name, lable, rowcount))

            # Parse the label once instead of once per comparison.
            bucket_dir = repo + _detection_bucket(int(lable))
            # copyfile does not create missing directories.
            if not os.path.isdir(bucket_dir):
                os.mkdir(bucket_dir)
            copyfile(repo + file_name, bucket_dir + '/' + file_name)
            break
def menu_select():
    """Show the one-shot DataSet Generator menu and run the chosen action.

    The sample repository and the method-dump directory are hard-coded
    below.  One number is read from standard input and dispatched; any other
    input prints ``Wrong Number``.
    """
    db = DatabaseHandler()
    repo = '/Users/midnightgeek/Repo/11/l12/'
    dump_Method_dir = '/Users/midnightgeek/Tools/test2'

    print('********* DataSet Generator *********')
    print('Enter 1 For Run LDA')
    print('Enter 2 For Fill Samples Table')
    print('Enter 3 For Lable Sample With VT Api')
    print('Enter 4 For Clear Samples Table')
    print('Enter 5 For capture Image')
    print('Enter 6 For Run n-opcode')
    menu = raw_input("Enter Number : ")

    if menu == '1':
        run_whole_process(repo, dump_Method_dir)
    elif menu == '2':
        # fill_samples_table takes only the repository path, as option 5
        # already assumes; the extra dump-dir argument raised TypeError.
        fill_samples_table(repo)
    elif menu == '3':
        update_samples_label(repo)
    elif menu == '4':
        db.clear_table_samples()
    elif menu == '5':
        fill_samples_table(repo)
        capture_image(repo, dump_Method_dir)
    elif menu == '6':
        n_opcode_progress(repo, dump_Method_dir)
    else:
        print('Wrong Number')
def menu_select():
    """Run the interactive DataSet Generator menu until the user exits.

    Asks for the sample directory (a trailing '/' is appended if missing),
    makes sure a ``dec_bank`` sub-directory exists for method dumps, and then
    loops: print the menu, read a number, run the matching action.  Entering
    ``5`` leaves the loop; unknown input prints ``Wrong Number``.
    """
    db = DatabaseHandler()

    repo = raw_input("Directory Address : ")
    if not repo.endswith('/'):
        repo += '/'

    dump_method_dir = repo + 'dec_bank'
    try:
        os.mkdir(dump_method_dir)
    except OSError:
        # Best effort: the directory usually exists already.
        pass

    banner = (
        '********* DataSet Generator *********',
        'Enter 1 For Fill Samples Table',
        'Enter 2 For Label Sample With VT Api',
        'Enter 3 For Upcode sequence Generator',
        'Enter 4 For Clear Samples Table',
        'Enter 5 For Exit',
    )
    actions = {
        '1': lambda: fill_samples_table(repo),
        '2': lambda: update_samples_label(repo),
        '3': lambda: opcode_sequence_generator(repo, dump_method_dir),
        '4': lambda: db.clear_table_Dataset(),
    }

    while True:
        for line in banner:
            print(line)
        choice = raw_input("Enter Number : ")
        if choice == '5':
            break
        action = actions.get(choice)
        if action is None:
            print('Wrong Number')
        else:
            action()
def opcode_sequence_generator5(repo, dumpMethodDir):
    """Write one ``<n>_result.arff`` file per sequence length n = 2..10.

    For every n, each ``.apk`` row from ``db.select_sample_all()`` is dumped
    with ``dump_all_method`` and passed to ``check_opcode(dumpMethodDir, n)``.
    Results for files whose name starts with ``bin_`` are collected
    separately from all other APKs, and both lists are handed to
    ``write_arff_n_opcode``.

    A sample that raises is reported on stdout and skipped.

    :param repo: sample directory, concatenated directly with file names.
    :param dumpMethodDir: directory the method dumps are written to.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()

    for n in range(2, 11):
        sample_mal = []
        sample_bin = []
        for sample in samples:
            try:
                name = sample[1]
                # Rows that are not APKs have nothing to dump; skipping them
                # avoids reusing the previous sample's sequence.
                if not name.endswith(".apk"):
                    continue
                dump_all_method(repo + name, dumpMethodDir)
                opcode_sequence = check_opcode(dumpMethodDir, n)
            except Exception as e:
                print(e)
                continue

            if name.startswith('bin_'):
                sample_bin.append(opcode_sequence)
            else:
                sample_mal.append(opcode_sequence)

        write_arff_n_opcode(n, repo + str(n) + '_result.arff',
                            sample_mal, sample_bin)
def opcode_sequence_generator(repo, dump_method_dir):
    """Build weighted opcode-sequence features and export them.

    Each ``.apk`` row from ``db.select_sample_all()`` is dumped, its opcode
    list and length-2 opcode sequence are extracted, and the results are
    grouped by file-name prefix (``bin_`` versus everything else).  Every
    sequence entry seen is also recorded once, in first-seen order.

    Both groups are then passed through ``weight_function`` and, for limits
    10, 20, ..., 90, through ``featureSelection``; each limit produces a
    ``result<limit>.csv`` and a ``result<limit>.arff`` file in ``repo``.

    A sample that raises is printed and skipped.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()

    unique_opcode_sequence = []
    known_entries = set()
    malware = {'sequences': [], 'opcodes': [], 'names': []}
    benign = {'sequences': [], 'opcodes': [], 'names': []}

    sample_total = str(len(samples))
    logger("Start Process : " + sample_total)

    position = 1
    for row in samples:
        try:
            logger("++++++ Progress : " + row[1] + " -> " + str(position)
                   + "/" + sample_total)
            position += 1

            logger("Start Sample : " + row[1])
            if row[1].endswith(".apk"):
                logger("Start dumping")
                dump_all_method(repo + row[1], dump_method_dir)
                logger("End dumping")
                opcode_list = extract_opcode(dump_method_dir)
                opcode_sequence = construct_opcode_sequence(
                    dump_method_dir, opcode_list, 2)

                # File-name prefix decides which class the sample joins.
                group = benign if row[1].startswith('bin_') else malware
                group['sequences'].append(opcode_sequence)
                group['opcodes'].append(opcode_list)
                group['names'].append(row[1])

            # Runs for every row; a non-APK row sees whatever sequence the
            # previous APK left behind.
            for entry in opcode_sequence:
                if entry in known_entries:
                    continue
                unique_opcode_sequence.append(entry)
                known_entries.add(entry)
            logger("End Sample")
        except Exception as e:
            print(e)

    mal_class = weight_function(malware['sequences'], malware['opcodes'],
                                unique_opcode_sequence)
    bin_class = weight_function(benign['sequences'], benign['opcodes'],
                                unique_opcode_sequence)

    for limit in range(10, 100, 10):
        mal_class_sum = featureSelection(mal_class, unique_opcode_sequence,
                                         limit)
        bin_class_sum = featureSelection(bin_class, unique_opcode_sequence,
                                         limit)
        base_name = repo + 'result' + str(limit)
        # Each writer gets its own freshly concatenated selection.
        write_arff_csv(base_name + '.csv', mal_class, bin_class,
                       mal_class_sum + bin_class_sum)
        write_arff(base_name + '.arff', mal_class, bin_class,
                   mal_class_sum + bin_class_sum)
def opcode_sequence_generator6(repo, dumpMethodDir):
    """Dump each APK once and write an ARFF file per sequence length 2..10.

    Every ``.apk`` row from ``db.select_sample_all()`` is dumped a single
    time; ``check_opcode`` is then called for each length n in 2..10 and the
    nine results are stored together, grouped by file-name prefix (``bin_``
    versus everything else).  The dump directory is cleaned after every
    sample.

    Afterwards, for each n, the per-sample results of that length are passed
    through ``func_weight_freq`` and written with ``write_arff_n_opcode`` to
    ``<n>_result.arff`` in ``repo``.

    A sample that raises is printed and contributes nothing.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()

    sizes = range(2, 11)
    sample_bin_banck = []
    sample_mal_banck = []

    for item in samples:
        try:
            name = item[1]
            # Non-APK rows must not be banked: an empty entry would make the
            # per-length indexing below fail.
            if not name.endswith(".apk"):
                continue
            try:
                dump_all_method(repo + name, dumpMethodDir)
                per_size = [check_opcode(dumpMethodDir, n) for n in sizes]
                if name.startswith('bin_'):
                    sample_bin_banck.append(per_size)
                else:
                    sample_mal_banck.append(per_size)
            finally:
                # Clean up even when the sample failed, so leftovers cannot
                # leak into the next sample's dump.
                clean_up_folder(dumpMethodDir)
        except Exception as e:
            print(e)

    for index, n in enumerate(sizes):
        sample_mal_1 = [per_size[index] for per_size in sample_mal_banck]
        sample_bin_1 = [per_size[index] for per_size in sample_bin_banck]
        mal_class_w = func_weight_freq(sample_mal_1)
        bin_class_w = func_weight_freq(sample_bin_1)
        write_arff_n_opcode(n, repo + str(n) + '_result.arff',
                            mal_class_w, bin_class_w)
def fill_samples_table(repo):
    """Recreate the samples table and insert every file found in ``repo``.

    Entries whose name contains ``.DS_Store`` are skipped.  Each file is
    inserted with an empty second value; a failing insert is printed and the
    loop carries on with the next file.
    """
    db = DatabaseHandler()
    db.recreats_table_samples()

    for file_name in get_all_files_in_directory(repo):
        try:
            if '.DS_Store' in file_name:
                continue
            db.insert_a_sample(file_name, '')
        except Exception as e:
            print(e)
def fill_samples_table(repo):
    """Recreate the samples table and insert every ``.apk`` file in ``repo``.

    Only entries whose string form ends with ``.apk`` are inserted, each with
    an empty second value.  A failing entry is printed and the loop carries
    on with the next one.
    """
    db = DatabaseHandler()
    db.recreats_table_samples()

    for entry in get_all_files_in_directory(repo):
        try:
            is_apk = str(entry).endswith('.apk')
            if not is_apk:
                continue
            db.insert_a_sample(entry, '')
        except Exception as e:
            print(e)
def _gen4_weight(sample, feature):
    """Return the sample's weight for ``feature``, or 0 when it is absent."""
    return sample[feature] if feature in sample else 0


def _gen4_ratio(numerator, denominator):
    """Float division that yields 0.0 instead of failing on a zero divisor."""
    return float(numerator) / denominator if denominator else 0.0


def _gen4_split(items, names, fold_index, test_count):
    """Return (train items, test items, test names) for one fold."""
    items = list(items)
    start = fold_index * test_count
    stop = start + test_count
    return items[:start] + items[stop:], items[start:stop], names[start:stop]


def _gen4_rank_features(train_mal, train_bin, vector):
    """Return (feature, score) pairs sorted by descending SB / SW score."""
    mal_count = float(len(train_mal))
    bin_count = float(len(train_bin))
    total_count = mal_count + bin_count

    print("start Calculate Mean Malware Class")
    mal_sum = dict((f, sum(_gen4_weight(s, f) for s in train_mal))
                   for f in vector)
    print("start Calculate Mean Bin Class")
    bin_sum = dict((f, sum(_gen4_weight(s, f) for s in train_bin))
                   for f in vector)

    print("start Calculate SW")
    within = {}
    for f in vector:
        mal_mean = mal_sum[f] / mal_count
        bin_mean = bin_sum[f] / bin_count
        # A sample lacking the feature counts as weight 0.
        within[f] = (
            sum((_gen4_weight(s, f) - mal_mean) ** 2 for s in train_mal)
            + sum((_gen4_weight(s, f) - bin_mean) ** 2 for s in train_bin))

    print("start Calculate Mean SB")
    malware_share = mal_count * 100 / total_count
    # Share of the benign class (was computed from the malware count).
    binware_share = bin_count * 100 / total_count
    between = {}
    for f in vector:
        total_mean = (mal_sum[f] + bin_sum[f]) / total_count
        between[f] = (
            malware_share * (mal_sum[f] / mal_count - total_mean) ** 2
            + binware_share * (bin_sum[f] / bin_count - total_mean) ** 2)

    print("start Calculate ST")
    ranked = [(f, between[f] / within[f] if within[f] != 0 else 0)
              for f in vector]
    ranked.sort(key=lambda pair: -pair[1])
    return ranked


def _gen4_opcodes_of(ranked_features):
    """Return the distinct opcodes named by the features, first-seen order."""
    opcodes = []
    seen = set()
    for feature, _score in ranked_features:
        # Features are "<opcode> <opcode>" pairs.
        for opcode in feature.strip().split()[:2]:
            if opcode not in seen:
                opcodes.append(opcode)
                seen.add(opcode)
    return opcodes


def _gen4_pair_rows(samples, opcodes):
    """Return one flattened opcode-by-opcode weight matrix per sample."""
    size = len(opcodes)
    rows = np.zeros((len(samples), size * size))
    for row_index, sample in enumerate(samples):
        for x, opc_i in enumerate(opcodes):
            for y, opc_j in enumerate(opcodes):
                key = str(opc_i) + " " + str(opc_j)
                if key in sample:
                    rows[row_index][x * size + y] = sample[key]
    return rows


def _gen4_predict(clf, samples, opcodes):
    """Return the predicted labels for ``samples`` (empty list if none)."""
    if len(samples) == 0:
        return []
    return list(clf.predict(_gen4_pair_rows(samples, opcodes)))


def _gen4_evaluate_fold(fold, top):
    """Train LDA on the fold's top-``top`` features and score its test set."""
    opcodes = _gen4_opcodes_of(fold['ranked'][:top])

    # Malware rows first (label 1), then benign rows (label 0).
    train_samples = fold['train_mal'] + fold['train_bin']
    train_lable = [1] * len(fold['train_mal']) + [0] * len(fold['train_bin'])
    clf = LinearDiscriminantAnalysis()
    clf.fit(_gen4_pair_rows(train_samples, opcodes), train_lable)

    mal_pred = _gen4_predict(clf, fold['test_mal'], opcodes)
    bin_pred = _gen4_predict(clf, fold['test_bin'], opcodes)
    tp = sum(1 for label in mal_pred if label == 1)
    fn = len(mal_pred) - tp
    tn = sum(1 for label in bin_pred if label == 0)
    fp = len(bin_pred) - tn

    return {
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
        'acc': _gen4_ratio(tp + tn, tp + tn + fp + fn),
        'tpr': _gen4_ratio(tp, tp + fn),
        'fpr': _gen4_ratio(fp, fp + tn),
        'len': len(opcodes),
        'fn_name': [name for name, label
                    in zip(fold['test_mal_name'], mal_pred) if label != 1],
        'fp_name': [name for name, label
                    in zip(fold['test_bin_name'], bin_pred) if label != 0],
    }


def _gen4_write_fold_report(report, result):
    """Append one fold's counts and misclassified names to the text report."""
    lines = [
        '',
        'TP : ' + str(result['tp']),
        'TN : ' + str(result['tn']),
        'FP : ' + str(result['fp']),
        'FN : ' + str(result['fn']),
        'ACC : ' + str(result['acc']),
        'LEN : ' + str(result['len']),
    ]
    lines.extend('fp_name : ' + str(name) for name in result['fp_name'])
    lines.extend('fn_name : ' + str(name) for name in result['fn_name'])
    report.write('\n'.join(lines) + '\n')


def opcode_sequence_generator4(repo, dumpMethodDir):
    """Cross-validate an LDA classifier on ranked opcode-pair features.

    Steps:

    1. Every ``.apk`` row from ``db.select_sample_all()`` is dumped; its
       ``check_opcode(dumpMethodDir, 2)`` and ``check_opcode2`` results are
       grouped by file-name prefix (``bin_`` versus everything else), and
       each distinct sequence entry is recorded once.
    2. Both groups go through ``func_weight_p_op1_op2`` and the result is
       written to ``result.arff``.
    3. The weighted samples are split into 5 folds.  For each fold the
       features are ranked by between-class over within-class scatter,
       computed on the training part only.
    4. For ``top`` = 2..249 the best ``top`` features of each fold define an
       opcode set; samples become flattened opcode-by-opcode matrices, an
       ``LinearDiscriminantAnalysis`` is fitted and the fold's test part is
       scored.

    Per-fold results go to ``resultLDA.txt``.  One row per ``top`` with the
    fold averages (TP, FP, TN, FN, TPR, FPR, ACC, top, opcode-set size) goes
    to ``Expenses01.xlsx``.  Both files are created in ``repo``.

    :param repo: sample directory, concatenated directly with file names.
    :param dumpMethodDir: directory the method dumps are written to.
    :raises ValueError: if either class ends up with no samples.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()

    vector = []
    seen = set()
    sample_mal, sample_mal_1, sample_mal_name = [], [], []
    sample_bin, sample_bin_1, sample_bin_name = [], [], []
    for sample in samples:
        try:
            name = sample[1]
            # Only APKs can be dumped; other rows are ignored.
            if not name.endswith(".apk"):
                continue
            dump_all_method(repo + name, dumpMethodDir)
            opcode_sequence = check_opcode(dumpMethodDir, 2)
            opcode_list1 = check_opcode2(dumpMethodDir)
            if name.startswith('bin_'):
                sample_bin.append(opcode_sequence)
                sample_bin_1.append(opcode_list1)
                sample_bin_name.append(name)
            else:
                sample_mal.append(opcode_sequence)
                sample_mal_1.append(opcode_list1)
                sample_mal_name.append(name)
            for opcode_pair in opcode_sequence:
                if opcode_pair not in seen:
                    vector.append(opcode_pair)
                    seen.add(opcode_pair)
        except Exception as e:
            print(e)

    mal_class = func_weight_p_op1_op2(sample_mal, sample_mal_1, vector)
    bin_class = func_weight_p_op1_op2(sample_bin, sample_bin_1, vector)
    write_arff(repo + 'result.arff', mal_class, bin_class)

    if len(mal_class) == 0 or len(bin_class) == 0:
        raise ValueError(
            'LDA evaluation needs at least one malware and one benign sample')

    n_fold = 5
    test_count_mal = len(mal_class) // n_fold
    test_count_bin = len(bin_class) // n_fold

    # The split and the ranking do not depend on `top`, so they are computed
    # once per fold rather than once per (top, fold).
    folds = []
    for fold_index in range(n_fold):
        train_mal, test_mal, test_mal_name = _gen4_split(
            mal_class, sample_mal_name, fold_index, test_count_mal)
        train_bin, test_bin, test_bin_name = _gen4_split(
            bin_class, sample_bin_name, fold_index, test_count_bin)
        folds.append({
            'train_mal': train_mal, 'train_bin': train_bin,
            'test_mal': test_mal, 'test_bin': test_bin,
            'test_mal_name': test_mal_name, 'test_bin_name': test_bin_name,
            'ranked': _gen4_rank_features(train_mal, train_bin, vector),
        })

    averaged = ('tp', 'fp', 'tn', 'fn', 'tpr', 'fpr', 'acc')
    workbook = xlsxwriter.Workbook(repo + 'Expenses01.xlsx')
    try:
        worksheet = workbook.add_worksheet()
        with open(repo + 'resultLDA.txt', "w") as fp_lda:
            for row_index, top in enumerate(range(2, 250)):
                fp_lda.write("************** TOP" + str(top)
                             + " **************")
                fp_lda.write('\n')

                totals = dict.fromkeys(averaged + ('len',), 0.0)
                for fold in folds:
                    result = _gen4_evaluate_fold(fold, top)
                    _gen4_write_fold_report(fp_lda, result)
                    for key in totals:
                        totals[key] += result[key]

                row = [totals[key] / n_fold for key in averaged]
                row.append(top)
                row.append(totals['len'] / n_fold)
                for col_index, value in enumerate(row):
                    worksheet.write(row_index, col_index, value)
    finally:
        # XlsxWriter only writes the file when the workbook is closed.
        workbook.close()
def capture_image(repo, dump_method_dir):
    """Render each sample's opcode-pair weights as a 256x256 PNG image.

    Every row from ``db.select_sample_all()`` is dumped and analysed with
    ``check_opcode(dump_method_dir, 2)`` and ``check_opcode2``; rows whose
    name ends with ``.apk`` are kept as samples.  The samples are weighted
    with ``func_weight_p_op1_op2``.  Each distinct opcode appearing in any
    weighted key gets an index in first-seen order, and a sample's weight
    for the pair "a b" is stored at ``image[index(a)][index(b)]``.

    The matrix is rescaled to 0..255, shown, and saved as
    ``<sample name>.png`` in the current working directory.

    A sample that raises during dumping is printed and skipped.

    :param repo: sample directory, concatenated directly with file names.
    :param dump_method_dir: directory the method dumps are written to.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()

    vector = []
    seen = set()
    sample = []
    sample_1 = []
    sample_name = []
    for row in samples:
        try:
            dump_all_method(repo + row[1], dump_method_dir)
            opcode_sequence = check_opcode(dump_method_dir, 2)
            opcode_list1 = check_opcode2(dump_method_dir)
            if row[1].endswith(".apk"):
                sample.append(opcode_sequence)
                sample_1.append(opcode_list1)
                sample_name.append(row[1])
            for opcode_pair in opcode_sequence:
                if opcode_pair not in seen:
                    vector.append(opcode_pair)
                    seen.add(opcode_pair)
        except Exception as e:
            print(e)

    sample_class = func_weight_p_op1_op2(sample, sample_1, vector)

    # Assign each opcode a matrix index in first-seen order.
    final_op_set = []
    opcode_bank = {}
    for weights in sample_class:
        for key in weights:
            for opcode in key.strip().split()[:2]:
                if opcode not in opcode_bank:
                    opcode_bank[opcode] = len(final_op_set)
                    final_op_set.append(opcode)

    for name, weights in zip(sample_name, sample_class):
        # Fixed 256x256 canvas: more than 256 distinct opcodes would index
        # out of range.
        image = np.zeros((256, 256))
        for x, opc_i in enumerate(final_op_set):
            for y, opc_j in enumerate(final_op_set):
                key = str(opc_i) + " " + str(opc_j)
                if key in weights:
                    image[x][y] = weights[key]

        peak = image.max()
        if peak == 0:
            # An all-zero matrix would divide by zero; save a black image.
            rescaled = np.zeros(image.shape, dtype=np.uint8)
        else:
            rescaled = (255.0 / peak * (image - image.min())).astype(
                np.uint8)

        im = Image.fromarray(rescaled)
        im.show()
        im.save(str(name) + '.png', 'PNG')
def __init__(self):
    """Create a ``DatabaseHandler`` and keep it on the instance as ``db``."""
    handler = DatabaseHandler()
    self.db = handler