Example #1
0
def _label_bucket(count):
    """Return the bucket sub-directory (with trailing slash) for a label.

    Mapping: 0 -> "0/", 1 -> "1/", 2..5 -> "5/", 6..10 -> "10/", anything
    else (including negative numbers) -> "more/".
    """
    if count == 0:
        return "0/"
    if count == 1:
        return "1/"
    if 1 < count <= 5:
        return "5/"
    if 5 < count <= 10:
        return "10/"
    return "more/"


def update_samples_label(repo):
    """Label every stored sample and copy its file into a per-label bucket.

    For each row returned by ``DatabaseHandler.select_sample_all`` the part
    of the file name (``item[1]``) before the first dot is passed to
    ``make_virus_total_request``.  While the answer is the string
    ``'Forbidden'`` the request is repeated after a 120 second pause.  Any
    other answer is stored through ``update_sample_lable`` and the file is
    copied from ``repo`` into the bucket chosen by ``_label_bucket``.

    Args:
        repo: Directory prefix of the samples.  It is concatenated directly
            with file names, so it must end with a path separator.

    Raises:
        ValueError: if a non-``'Forbidden'`` answer cannot be parsed by
            ``int``.
    """
    # Local import so this function does not depend on the module header.
    import os

    db = DatabaseHandler()
    for item in db.select_sample_all():
        file_name = item[1]
        while True:
            lable = make_virus_total_request(file_name.split('.')[0])
            if lable == 'Forbidden':
                print('%s  ->  %s  : Forbidden' % (item[0], file_name))
                time.sleep(120)
                continue

            shash = unicodedata.normalize('NFKD', file_name).encode(
                'ascii', 'ignore')
            rowcount = db.update_sample_lable(shash, lable)
            print('%s  ->  %s  :  %s  RowCount :  %s' %
                  (item[0], file_name, lable, str(rowcount)))

            # Parse the label once; the bucket decides the target folder.
            target_dir = repo + _label_bucket(int(lable))
            # copyfile does not create directories, so make sure the bucket
            # exists instead of aborting the whole run on the first sample.
            if not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            copyfile(repo + file_name, target_dir + file_name)
            break
Example #2
0
def menu_select():
    """Print the data-set generator menu once and run the chosen action.

    The sample repository and the method-dump directory are hard-coded
    below.  One line is read with ``raw_input``; any value other than
    '1'..'6' prints 'Wrong Number'.
    """
    db = DatabaseHandler()
    samples_root = '/Users/midnightgeek/Repo/11/l12/'
    dump_dir = '/Users/midnightgeek/Tools/test2'

    for menu_line in (
            '********* DataSet Generator *********',
            'Enter 1 For Run LDA',
            'Enter 2 For Fill Samples Table',
            'Enter 3 For Lable Sample With VT Api',
            'Enter 4 For Clear Samples Table',
            'Enter 5 For capture Image',
            'Enter 6 For Run n-opcode',
    ):
        print(menu_line)

    def refill_then_capture():
        fill_samples_table(samples_root)
        capture_image(samples_root, dump_dir)

    # NOTE(review): option 2 passes two arguments to fill_samples_table while
    # option 5 passes one; confirm which signature the helper really has.
    handlers = {
        '1': lambda: run_whole_process(samples_root, dump_dir),
        '2': lambda: fill_samples_table(samples_root, dump_dir),
        '3': lambda: update_samples_label(samples_root),
        '4': lambda: db.clear_table_samples(),
        '5': refill_then_capture,
        '6': lambda: n_opcode_progress(samples_root, dump_dir),
    }

    choice = raw_input("Enter Number : ")
    handler = handlers.get(choice)
    if handler is None:
        print('Wrong Number')
    else:
        handler()
Example #3
0
def menu_select():
    """Run the interactive data-set generator menu until the user exits.

    Asks for the sample directory, appends a trailing '/' when missing and
    tries to create a 'dec_bank' sub-directory in it (an ``OSError`` from
    ``os.mkdir`` is ignored).  The menu is then shown repeatedly; entering
    '5' leaves the loop and any unknown entry prints 'Wrong Number'.
    """
    db = DatabaseHandler()
    repo = raw_input("Directory Address : ")
    if not repo.endswith('/'):
        repo += '/'
    dump_method_dir = repo + 'dec_bank'
    try:
        os.mkdir(dump_method_dir)
    except OSError:
        # Best effort, as before: any OSError from mkdir is ignored.
        pass

    banner = (
        '********* DataSet Generator *********',
        'Enter 1 For Fill Samples Table',
        'Enter 2 For Label Sample With VT Api',
        'Enter 3 For Upcode sequence Generator',
        'Enter 4 For Clear Samples Table',
        'Enter 5 For Exit',
    )
    while True:
        for banner_line in banner:
            print(banner_line)
        choice = raw_input("Enter Number : ")
        if choice == '5':
            break
        if choice == '1':
            fill_samples_table(repo)
        elif choice == '2':
            update_samples_label(repo)
        elif choice == '3':
            opcode_sequence_generator(repo, dump_method_dir)
        elif choice == '4':
            db.clear_table_Dataset()
        else:
            print('Wrong Number')
Example #4
0
def opcode_sequence_generator5(repo, dumpMethodDir):
    """Write one ``<n>_result.arff`` file per opcode n-gram size 2..10.

    For every n the samples from ``DatabaseHandler.select_sample_all`` whose
    file name (``item[1]``) ends with ".apk" are dumped with
    ``dump_all_method`` and passed to ``check_opcode(dumpMethodDir, n)``.
    Results of files whose name starts with 'bin_' go to the benign list, all
    other results to the malware list; both lists are handed to
    ``write_arff_n_opcode``.

    Args:
        repo: Directory prefix of the samples; concatenated directly with
            file names and with the output file name.
        dumpMethodDir: Working directory given to the dump/check helpers.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()
    for i in range(2, 11):
        sample_mal = []
        sample_bin = []
        for item in samples:
            try:
                file_name = item[1]
                if not file_name.endswith(".apk"):
                    continue
                dump_all_method(repo + file_name, dumpMethodDir)
                opcode_sequence = check_opcode(dumpMethodDir, i)
                # Add opcode seq to the class it belongs to
                if file_name.startswith('bin_'):
                    sample_bin.append(opcode_sequence)
                else:
                    sample_mal.append(opcode_sequence)
            except Exception as e:
                # Best effort per sample: report the failure and keep going.
                print(e)
        write_arff_n_opcode(i, repo + str(i) + '_result.arff', sample_mal,
                            sample_bin)
Example #5
0
def opcode_sequence_generator(repo, dump_method_dir):
    """Build weighted opcode features and write CSV/ARFF result files.

    Every sample from ``DatabaseHandler.select_sample_all`` whose file name
    (``sample_item[1]``) ends with ".apk" is dumped with ``dump_all_method``;
    ``extract_opcode`` and ``construct_opcode_sequence(..., 2)`` produce its
    opcode list and opcode sequence.  Names starting with 'bin_' feed the
    benign lists, all others the malware lists.  Each sequence element seen
    for the first time is appended to a shared, ordered list.

    Both classes are then weighted with ``weight_function`` and, for feature
    counts 10, 20, ..., 90, ``featureSelection`` results are written to
    ``result<count>.csv`` and ``result<count>.arff`` under ``repo``.

    Args:
        repo: Directory prefix of the samples and of the output files;
            concatenated directly with file names.
        dump_method_dir: Working directory given to the dump/extract helpers.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()
    unique_opcode_sequence = []
    seen = set()
    sample_mal_opcode_sequence = []
    sample_bin_opcode_sequence = []
    sample_mal_opcode = []
    sample_bin_opcode = []

    sample_total = str(len(samples))
    logger("Start Process : " + sample_total)
    # enumerate keeps the progress counter right even when a sample fails
    # before any manual increment could have run.
    for indexer, sample_item in enumerate(samples, 1):
        try:
            sample_name = sample_item[1]
            logger("++++++ Progress : " + sample_name + " -> " +
                   str(indexer) + "/" + sample_total)
            # Generate Opcode Seq for every sample
            logger("Start Sample : " + sample_name)
            if sample_name.endswith(".apk"):
                logger("Start dumping")
                dump_all_method(repo + sample_name, dump_method_dir)
                logger("End dumping")
                opcode_list = extract_opcode(dump_method_dir)
                opcode_sequence = construct_opcode_sequence(
                    dump_method_dir, opcode_list, 2)
                # Add opcode seq to the class it belongs to
                if sample_name.startswith('bin_'):
                    sample_bin_opcode_sequence.append(opcode_sequence)
                    sample_bin_opcode.append(opcode_list)
                else:
                    sample_mal_opcode_sequence.append(opcode_sequence)
                    sample_mal_opcode.append(opcode_list)
                # Generate the sequence bank (first-seen order, no repeats)
                for opcode_item in opcode_sequence:
                    if opcode_item not in seen:
                        unique_opcode_sequence.append(opcode_item)
                        seen.add(opcode_item)
            logger("End Sample")
        except Exception as e:
            # Best effort per sample: report the failure and keep going.
            print(e)

    mal_class = weight_function(sample_mal_opcode_sequence, sample_mal_opcode,
                                unique_opcode_sequence)
    bin_class = weight_function(sample_bin_opcode_sequence, sample_bin_opcode,
                                unique_opcode_sequence)
    for top_count in range(1, 10):
        feature_count = top_count * 10
        mal_class_sum = featureSelection(mal_class, unique_opcode_sequence,
                                         feature_count)
        bin_class_sum = featureSelection(bin_class, unique_opcode_sequence,
                                         feature_count)
        selected = mal_class_sum + bin_class_sum
        write_arff_csv(repo + 'result' + str(feature_count) + '.csv',
                       mal_class, bin_class, selected)
        write_arff(repo + 'result' + str(feature_count) + '.arff', mal_class,
                   bin_class, selected)
Example #6
0
def opcode_sequence_generator6(repo, dumpMethodDir):
    """Write one frequency-weighted ``<n>_result.arff`` per n-gram size 2..10.

    Each sample from ``DatabaseHandler.select_sample_all`` whose file name
    (``item[1]``) ends with ".apk" is dumped once with ``dump_all_method``;
    ``check_opcode(dumpMethodDir, n)`` is then called for every n and the
    dump directory is cleaned with ``clean_up_folder``.  Names starting with
    'bin_' are collected as benign, all others as malware.  Per n, the
    collected sequences of each class go through ``func_weight_freq`` and the
    two results are passed to ``write_arff_n_opcode``.

    Args:
        repo: Directory prefix of the samples and of the output files;
            concatenated directly with file names.
        dumpMethodDir: Working directory given to the dump/check helpers.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()
    # Single source of truth for the n-gram sizes used in both phases below.
    n_values = range(2, 11)
    sample_bin_banck = []
    sample_mal_banck = []
    for item in samples:
        try:
            file_name = item[1]
            if not file_name.endswith(".apk"):
                continue
            dump_all_method(repo + file_name, dumpMethodDir)
            # One entry per n, in the same order as n_values.
            sequences = [check_opcode(dumpMethodDir, n) for n in n_values]
            if file_name.startswith('bin_'):
                sample_bin_banck.append(sequences)
            else:
                sample_mal_banck.append(sequences)
            clean_up_folder(dumpMethodDir)
        except Exception as e:
            # Best effort per sample: a failing sample is reported and left
            # out of both banks.
            print(e)

    for position, n in enumerate(n_values):
        sample_bin_1 = [per_sample[position] for per_sample in sample_bin_banck]
        sample_mal_1 = [per_sample[position] for per_sample in sample_mal_banck]
        mal_class_w = func_weight_freq(sample_mal_1)
        bin_class_w = func_weight_freq(sample_bin_1)
        write_arff_n_opcode(n, repo + str(n) + '_result.arff',
                            mal_class_w, bin_class_w)
Example #7
0
def fill_samples_table(repo):
    """Recreate the samples table and insert every file found under *repo*.

    Entries whose name contains '.DS_Store' are skipped.  An exception raised
    for a single entry is printed and the loop continues.
    """
    handler = DatabaseHandler()
    handler.recreats_table_samples()
    for candidate in get_all_files_in_directory(repo):
        try:
            if '.DS_Store' in candidate:
                continue
            handler.insert_a_sample(candidate, '')
        except Exception as error:
            print(error)
Example #8
0
def fill_samples_table(repo):
    """Recreate the samples table and insert every '.apk' entry of *repo*.

    Only entries whose string form ends with '.apk' are inserted.  An
    exception raised for a single entry is printed and the loop continues.
    """
    handler = DatabaseHandler()
    handler.recreats_table_samples()
    for candidate in get_all_files_in_directory(repo):
        try:
            if not str(candidate).endswith('.apk'):
                continue
            handler.insert_a_sample(candidate, '')
        except Exception as error:
            print(error)
Example #9
0
def opcode_sequence_generator4(repo, dumpMethodDir):
    """Rank opcode-pair features and evaluate an LDA classifier on them.

    Phase 1: every sample from ``DatabaseHandler.select_sample_all`` whose
    file name (``item[1]``) ends with ".apk" is dumped with
    ``dump_all_method`` and analysed with ``check_opcode(..., 2)`` and
    ``check_opcode2``.  Names starting with 'bin_' form the benign class, all
    others the malware class.  Both classes are weighted with
    ``func_weight_p_op1_op2`` and written to ``result.arff``.

    Phase 2: the weighted samples are split into 5 contiguous folds.  For
    each fold the features are ranked by the ratio of between-class to
    within-class scatter computed on the training part.  For every
    ``top`` in 2..249 the best ``top`` features select the opcodes used to
    build the feature matrices, a ``LinearDiscriminantAnalysis`` model is
    fitted and tested, per-fold results are appended to ``resultLDA.txt``
    and the per-``top`` averages are written to one row of
    ``Expenses01.xlsx``.

    Args:
        repo: Directory prefix of the samples and of all output files;
            concatenated directly with file names.
        dumpMethodDir: Working directory given to the dump/check helpers.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()
    vector = []
    seen = set()
    sample_mal = []
    sample_bin = []
    sample_mal_1 = []
    sample_bin_1 = []
    sample_mal_name = []
    sample_bin_name = []
    for item in samples:
        try:
            file_name = item[1]
            if not file_name.endswith(".apk"):
                continue
            # Generate Opcode Seq for every sample
            dump_all_method(repo + file_name, dumpMethodDir)
            opcode_sequence = check_opcode(dumpMethodDir, 2)
            opcode_list1 = check_opcode2(dumpMethodDir)
            # Add opcode seq to the class it belongs to
            if file_name.startswith('bin_'):
                sample_bin.append(opcode_sequence)
                sample_bin_1.append(opcode_list1)
                sample_bin_name.append(file_name)
            else:
                sample_mal.append(opcode_sequence)
                sample_mal_1.append(opcode_list1)
                sample_mal_name.append(file_name)
            # Generate the sequence bank (first-seen order, no repeats)
            for sequence in opcode_sequence:
                if sequence not in seen:
                    vector.append(sequence)
                    seen.add(sequence)
        except Exception as e:
            # Best effort per sample: report the failure and keep going.
            print(e)

    mal_class = func_weight_p_op1_op2(sample_mal, sample_mal_1, vector)
    bin_class = func_weight_p_op1_op2(sample_bin, sample_bin_1, vector)
    write_arff(repo + 'result.arff', mal_class, bin_class)

    # The class means below divide by the class sizes, and a two-class model
    # cannot be fitted on one class, so stop early with a clear message.
    if len(mal_class) == 0 or len(bin_class) == 0:
        print('LDA evaluation skipped: malware and benign samples are both '
              'required')
        return

    n_fold = 5
    # The split and the feature ranking depend only on the fold, not on
    # `top`, so they are computed once here and reused for every `top`.
    folds = []
    for fold_index in range(n_fold):
        train_bin, test_bin, test_bin_name = _lda_split_fold(
            bin_class, sample_bin_name, fold_index, n_fold)
        train_mal, test_mal, test_mal_name = _lda_split_fold(
            mal_class, sample_mal_name, fold_index, n_fold)
        ranking = _lda_rank_features(train_mal, train_bin, vector)
        folds.append((train_mal, train_bin, test_mal, test_bin,
                      test_mal_name, test_bin_name, ranking))

    output_filename = repo + 'resultLDA.txt'
    simple_result = repo + 'Expenses01.xlsx'
    workbook = xlsxwriter.Workbook(simple_result)
    try:
        worksheet = workbook.add_worksheet()
        with open(output_filename, "w") as fp_lda:
            for row_index, top in enumerate(range(2, 250)):
                fp_lda.write("************** TOP" + str(top) +
                             " **************")
                fp_lda.write('\n')
                total_tp = total_tn = total_fp = total_fn = 0
                total_acc = total_tpr = total_fpr = 0.0
                total_final_set = 0
                for (train_mal, train_bin, test_mal, test_bin, test_mal_name,
                     test_bin_name, ranking) in folds:
                    final_op_set = _lda_collect_opcodes(ranking[:top])

                    # Training rows: malware first (label 1), then benign (0).
                    train_set = _lda_build_matrix(train_mal + train_bin,
                                                  final_op_set)
                    train_lable = [1] * len(train_mal) + [0] * len(train_bin)
                    clf = LinearDiscriminantAnalysis()
                    clf.fit(train_set, train_lable)

                    fn_name = _lda_misclassified(
                        clf, _lda_build_matrix(test_mal, final_op_set),
                        test_mal_name, 1)
                    fp_name = _lda_misclassified(
                        clf, _lda_build_matrix(test_bin, final_op_set),
                        test_bin_name, 0)
                    fn = len(fn_name)
                    tp = len(test_mal) - fn
                    fp = len(fp_name)
                    tn = len(test_bin) - fp

                    acc = _lda_ratio(tp + tn, tp + tn + fp + fn)
                    tpr = _lda_ratio(tp, tp + fn)
                    fpr = _lda_ratio(fp, fp + tn)

                    # Leading '' reproduces the blank line before each fold.
                    report = [
                        '',
                        'TP : ' + str(tp),
                        'TN : ' + str(tn),
                        'FP : ' + str(fp),
                        'FN : ' + str(fn),
                        'ACC : ' + str(acc),
                        'LEN : ' + str(len(final_op_set)),
                    ]
                    report.extend('fp_name : ' + str(name)
                                  for name in fp_name)
                    report.extend('fn_name : ' + str(name)
                                  for name in fn_name)
                    fp_lda.write('\n'.join(report) + '\n')

                    total_tp += tp
                    total_tn += tn
                    total_fp += fp
                    total_fn += fn
                    total_acc += acc
                    total_tpr += tpr
                    total_fpr += fpr
                    total_final_set += len(final_op_set)

                fold_count = float(n_fold)
                row_values = [
                    total_tp / fold_count,
                    total_fp / fold_count,
                    total_tn / fold_count,
                    total_fn / fold_count,
                    total_tpr / fold_count,
                    total_fpr / fold_count,
                    total_acc / fold_count,
                    top,
                    total_final_set / fold_count,
                ]
                for col_index, value in enumerate(row_values):
                    worksheet.write(row_index, col_index, value)
    finally:
        # The workbook must be closed explicitly so the file is written out.
        workbook.close()


def _lda_split_fold(class_rows, names, fold_index, n_fold):
    """Return (train_rows, test_rows, test_names) for one contiguous fold."""
    test_count = len(class_rows) // n_fold
    start = fold_index * test_count
    stop = start + test_count
    train_rows = list(class_rows[:start]) + list(class_rows[stop:])
    return train_rows, list(class_rows[start:stop]), list(names[start:stop])


def _lda_class_sums(rows, vector):
    """Sum each feature's weight over the rows that contain the feature."""
    return dict(
        (feature, sum(row[feature] for row in rows if feature in row))
        for feature in vector)


def _lda_rank_features(train_mal_class, train_bin_class, vector):
    """Return (feature, score) pairs sorted by descending scatter ratio."""
    mal_count = float(len(train_mal_class))
    bin_count = float(len(train_bin_class))
    total_len = mal_count + bin_count

    print("start Calculate Mean Malware Class")
    mal_sums = _lda_class_sums(train_mal_class, vector)
    print("start Calculate Mean Bin Class")
    bin_sums = _lda_class_sums(train_bin_class, vector)
    mean_mal = dict((f, mal_sums[f] / mal_count) for f in vector)
    mean_bin = dict((f, bin_sums[f] / bin_count) for f in vector)
    mean_total = dict(
        (f, (mal_sums[f] + bin_sums[f]) / total_len) for f in vector)

    # Within-class scatter; a feature missing from a sample counts as 0.
    print("start Calculate SW")
    SW = {}
    for feature in vector:
        scatter = 0
        for rows, means in ((train_mal_class, mean_mal),
                            (train_bin_class, mean_bin)):
            for row in rows:
                value = row[feature] if feature in row else 0
                deviation = value - means[feature]
                scatter = scatter + deviation * deviation
        SW[feature] = scatter

    # Between-class scatter, each class weighted by its share in percent.
    # Every feature of `vector` has a mean in both classes, so one formula
    # covers all cases.
    print("start Calculate Mean SB")
    malware_percentage = mal_count * 100 / total_len
    binware_percentage = bin_count * 100 / total_len
    SB = {}
    for feature in vector:
        mal_gap = mean_mal[feature] - mean_total[feature]
        bin_gap = mean_bin[feature] - mean_total[feature]
        SB[feature] = (malware_percentage * mal_gap * mal_gap +
                       binware_percentage * bin_gap * bin_gap)

    print("start Calculate ST")
    ST = {}
    for feature in vector:
        ST[feature] = SB[feature] / SW[feature] if SW[feature] != 0 else 0
    return sorted(ST.items(), key=lambda pair: -pair[1])


def _lda_collect_opcodes(ranked_pairs):
    """List, in first-seen order, the opcodes named by the ranked keys."""
    final_op_set = []
    known = set()
    for key, _score in ranked_pairs:
        # Each key is expected to hold two whitespace-separated opcodes.
        for opcode in key.strip().split()[:2]:
            if opcode not in known:
                final_op_set.append(opcode)
                known.add(opcode)
    return final_op_set


def _lda_build_matrix(rows, final_op_set):
    """Build one flattened opcode-pair weight matrix row per sample."""
    pair_keys = [str(opc_i) + " " + str(opc_j)
                 for opc_i in final_op_set for opc_j in final_op_set]
    matrix = np.zeros((len(rows), len(pair_keys)))
    for row_index, row in enumerate(rows):
        for col_index, key in enumerate(pair_keys):
            if key in row:
                matrix[row_index][col_index] = row[key]
    return matrix


def _lda_misclassified(clf, matrix, names, expected_label):
    """Return the names of rows not predicted as ``expected_label``."""
    missed = []
    for index, row in enumerate(matrix):
        if clf.predict(row.reshape(1, -1))[0] != expected_label:
            missed.append(names[index])
    return missed


def _lda_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero divisor."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator
Example #10
0
def capture_image(repo, dump_method_dir):
    """Render each '.apk' sample's opcode-pair weights as a 256x256 PNG.

    Every sample from ``DatabaseHandler.select_sample_all`` is dumped with
    ``dump_all_method`` and analysed with ``check_opcode(..., 2)`` and
    ``check_opcode2``; only samples whose file name (``item[1]``) ends with
    ".apk" are kept for rendering.  The kept samples are weighted with
    ``func_weight_p_op1_op2``.  Each opcode named in a weight key gets a
    fixed index, and the weight of key "a b" is stored at
    ``image[index(a)][index(b)]``.  The image is rescaled to 0..255, shown
    and saved as ``<file name>.png`` in the current working directory.

    Args:
        repo: Directory prefix of the samples; concatenated directly with
            file names.
        dump_method_dir: Working directory given to the dump/check helpers.
    """
    db = DatabaseHandler()
    samples = db.select_sample_all()
    vector = []
    seen = set()
    sample = []
    sample_1 = []
    sample_name = []
    for item in samples:
        try:
            file_name = item[1]
            # Generate Opcode Seq for every sample
            dump_all_method(repo + file_name, dump_method_dir)
            opcode_sequence = check_opcode(dump_method_dir, 2)
            opcode_list1 = check_opcode2(dump_method_dir)
            if file_name.endswith(".apk"):
                sample.append(opcode_sequence)
                sample_1.append(opcode_list1)
                sample_name.append(file_name)
            for sequence in opcode_sequence:
                if sequence not in seen:
                    vector.append(sequence)
                    seen.add(sequence)
        except Exception as e:
            # Best effort per sample: report the failure and keep going.
            print(e)

    sample_class = func_weight_p_op1_op2(sample, sample_1, vector)

    # Give every opcode that occurs in a weight key a stable image index.
    final_op_set = []
    opcode_bank = {}
    for weights in sample_class:
        for key in weights:
            # Each key is expected to hold two whitespace-separated opcodes.
            for opcode in key.strip().split()[:2]:
                if opcode not in opcode_bank:
                    opcode_bank[opcode] = len(final_op_set)
                    final_op_set.append(opcode)

    # NOTE(review): the image is fixed at 256x256, so more than 256 distinct
    # opcodes would raise IndexError below; confirm the opcode space fits.
    for weights, name in zip(sample_class, sample_name):
        image = np.zeros((256, 256))
        for opc_i in final_op_set:
            for opc_j in final_op_set:
                key = str(opc_i) + " " + str(opc_j)
                if key in weights:
                    image[opcode_bank[opc_i]][opcode_bank[opc_j]] = weights[key]
        peak = image.max()
        if peak == 0:
            # Nothing to scale; avoids dividing by zero and a NaN image.
            rescaled = np.zeros(image.shape, dtype=np.uint8)
        else:
            rescaled = (255.0 / peak * (image - image.min())).astype(
                np.uint8)
        im = Image.fromarray(rescaled)
        im.show()
        im.save(str(name) + '.png', 'PNG')
Example #11
0
 def __init__(self):
     """Create a ``DatabaseHandler`` and keep it on the instance as ``db``."""
     self.db = DatabaseHandler()