Esempio n. 1
0
def renameItemsByMD5(
        json_path,  # path of the JSON report files
        item_path,  # path of the companion data files
        ext_name=''):
    """Rename each JSON report and its companion data file to the report's MD5.

    For every ``<name>.json`` in ``json_path``, read its ``md5`` field, then
    rename the report to ``<md5>.json`` and the matching data file
    ``<name><ext_name>`` in ``item_path`` to ``<md5><ext_name>``.

    Items whose MD5 was already seen are logged as warnings and skipped.
    """
    reporter = Reporter()

    # Set instead of list: membership is tested once per item.
    seen_md5s = set()

    for json_item in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path + json_item)
            md5 = report['md5']

            # BUG FIX: the original renamed duplicates anyway, which would
            # clobber (or fail over) the file of the first item bearing the
            # same MD5. Duplicates are now skipped entirely.
            if md5 in seen_md5s:
                reporter.logWarning(entity=json_item, msg='MD5重复')
                continue

            seen_md5s.add(md5)

            # Strip the extension to recover the data file's base name.
            filename = '.'.join(json_item.split('.')[:-1])

            os.rename(json_path + json_item,
                      json_path + md5 + '.json')  # rename the JSON report
            os.rename(item_path + filename + ext_name,
                      item_path + md5 + ext_name)  # rename the data file

            reporter.logSuccess()
        except Exception as e:
            reporter.logError(entity=json_item, msg=str(e))

    reporter.report()
Esempio n. 2
0
def extractApiFromJson(path):
    """Condense each cuckoo report under ``path`` to ``{'name', 'apis'}``.

    Each sub-directory is assumed to contain a JSON report named after the
    directory itself. The condensed report is written back over the original
    file. Handles both pre-processed reports (top-level ``name``/``apis``)
    and raw cuckoo reports (name under ``target.file.name``, api calls under
    ``behavior.processes[*].calls[*].api``).
    """
    reporter = Reporter()

    for i, item_dir in enumerate(os.listdir(path)):
        print(i, item_dir)

        # The JSON file is assumed to share its folder's name.
        cur_json_path = path + item_dir + '/%s.json' % item_dir

        new_report = {'apis': []}

        try:
            report = loadJson(cur_json_path)

            # Support both processed and raw report layouts.
            if 'target' in report:
                new_report['name'] = report['target']['file']['name']
            else:
                new_report['name'] = report['name']

            if 'apis' in report:
                # Newer report version: api list already extracted.
                new_report['apis'] = report['apis']
            else:
                # Full report: collect api names process by process,
                # call by call.
                api_call_seq = []
                for process in report['behavior']['processes']:
                    for call in process['calls']:
                        api_call_seq.append(call['api'])

                new_report['apis'] = api_call_seq

            # BUG FIX: the original never wrote the condensed report back on
            # the success path (only the KeyError branch dumped it), so the
            # extraction result was silently discarded.
            dumpJson(new_report, cur_json_path)

            reporter.logSuccess()

        # A KeyError means the source file is malformed; keep the name if it
        # was captured and leave the api list empty.
        except KeyError as e:
            if 'name' in new_report:
                new_report['apis'] = []
                dumpJson(new_report, cur_json_path)

            # Otherwise leave the file untouched.
            reporter.logError(item_dir, str(e))

        # Any other error: report it and leave the file untouched.
        except Exception as e:
            reporter.logError(item_dir, str(e))

    reporter.report()
Esempio n. 3
0
def mappingApiNormalize(json_path,
                        mapping,
                        dump_mapping_path=None,
                        is_class_dir=False):
    """Rewrite every api token through ``mapping`` in all reports.

    Tokens present in ``mapping`` are replaced by their mapped value; the
    rest pass through unchanged. Each modified report is written back in
    place. If ``dump_mapping_path`` is given, the mapping itself is dumped
    there as well.
    """
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):

        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                # Map each token, leaving unmapped tokens untouched.
                report['apis'] = [
                    mapping[token] if token in mapping else token
                    for token in report['apis']
                ]

                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    if dump_mapping_path is not None:
        dumpJson(mapping, dump_mapping_path)

    reporter.report()
Esempio n. 4
0
def parseAndSampleDataset(scale_report_path,
                          base_path,
                          dst_path,
                          num_per_class,
                          checked=True):
    """Sample ``num_per_class`` items from each large-enough family.

    Families listed in the scale report with at least ``num_per_class``
    members are sampled and copied into ``dst_path/<family>/``. Raises if a
    family folder already exists at the destination. When ``checked`` is
    True, the destination folders are verified to hold the expected count.
    """
    scale_report = loadJson(scale_report_path)

    for family_name in tqdm(scale_report):
        members = scale_report[family_name]

        # Only families reaching the requested scale are sampled.
        if len(members) < num_per_class:
            continue

        random.seed(magicSeed())
        candidates = random.sample(members, num_per_class)

        family_dir = dst_path + family_name + '/'
        if os.path.exists(family_dir):
            raise RuntimeError("%s 类的文件夹在目标路径中已存在!" % (family_name,))
        os.mkdir(family_dir)

        for item in candidates:
            folder_name, item_name = item.split("/")
            full_item_name = item_name + '.' + folder_name
            shutil.copy(base_path + item, family_dir + full_item_name)

    if checked:
        reporter = Reporter()
        for folder in os.listdir(dst_path):
            count = len(os.listdir(dst_path + folder + '/'))
            if count == num_per_class:
                reporter.logSuccess()
            else:
                reporter.logError(entity=folder,
                                  msg="数量不足预期: %d/%d" % (count, num_per_class))
        reporter.report()
Esempio n. 5
0
def removeApiRedundance(json_path, selected_apis=None, class_dir=True):
    """Collapse consecutive duplicate api calls down to at most two.

    Only tokens in ``selected_apis`` are kept at all (``None`` keeps every
    token). Within the kept tokens, any run of identical consecutive calls
    is truncated to its first two occurrences. Each report is rewritten in
    place.
    """
    reporter = Reporter()

    # Sentinel that can never equal a real token.
    _fresh = object()

    for folder in tqdm(os.listdir(json_path)):

        items = (os.listdir(json_path + folder + '/')
                 if class_dir else [folder + '.json'])

        for item in items:

            item_path = json_path + folder + '/' + item

            try:
                report = loadJson(item_path)

                compressed = []
                prev_token = _fresh
                run_len = 0

                for token in report['apis']:
                    # Tokens outside the selection are dropped and do not
                    # affect run tracking.
                    if selected_apis is not None and token not in selected_apis:
                        continue

                    if token == prev_token:
                        run_len += 1
                    else:
                        prev_token, run_len = token, 1

                    # Keep at most the first two of each consecutive run.
                    if run_len <= 2:
                        compressed.append(token)

                # Replace the original api sequence with the compressed one.
                report['apis'] = compressed
                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()
Esempio n. 6
0
def extractAllZipFile(src_dir, dst_dir, psw):
    """Extract every zip archive in ``src_dir`` into ``dst_dir``.

    ``psw`` is forwarded to ``extractZipFile`` as the archive password.
    Extraction failures (RuntimeError) are logged per item.
    """
    reporter = Reporter()

    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)

    for archive in tqdm(os.listdir(src_dir)):
        try:
            extractZipFile(src=src_dir + archive, dst=dst_dir, psw=psw)
        except RuntimeError as e:
            reporter.logError(entity=archive, msg=str(e))
        else:
            reporter.logSuccess()

    reporter.report()
Esempio n. 7
0
def convertToNGramSeq(
        parent_path,
        window=3,
        ngram_dict=None,  # NGram frequency dict, already sorted by frequency
        ngram_max_num=None,  # keep only the top-n NGrams; None keeps all
        class_dir=False):
    """Convert each report's api sequence into an n-gram sequence, in place.

    Consecutive windows of ``window`` api tokens are joined into n-gram
    strings. When both ``ngram_dict`` and ``ngram_max_num`` are given, only
    the top ``ngram_max_num`` n-grams of that (sorted) dict are retained;
    otherwise every n-gram is kept. The result overwrites ``report['apis']``.
    """
    reporter = Reporter()

    if ngram_dict is not None and ngram_max_num is not None:
        # PERF FIX: membership is tested once per ngram in every report;
        # the original list made each test O(ngram_max_num). A set is O(1).
        valid_ngrams = set(list(ngram_dict.keys())[:ngram_max_num])
    else:
        valid_ngrams = None

    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                ngram_seq = []
                report = loadJson(folder_path + item)
                api_seq = report['apis']

                # NOTE(review): range(len - window) skips the final window
                # (looks like an off-by-one); kept as-is for consistency
                # with the counting in statNGram — confirm before changing.
                for i in range(len(api_seq) - window):
                    ngram = strlistToStr(api_seq[i:i + window])

                    # Keep the ngram when no filter is set, or when it is
                    # among the selected top ngrams.
                    if valid_ngrams is None or ngram in valid_ngrams:
                        ngram_seq.append(ngram)

                # Write the ngram sequence back over the api sequence.
                report['apis'] = ngram_seq
                dumpJson(report, folder_path + item)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(entity=folder + '/' + item, msg=str(e))
                continue

    reporter.report()
Esempio n. 8
0
def renameCuckooFolders(json_path):
    """Rename each cuckoo output folder after the analyzed sample's name.

    Reads ``<folder>/report.json``, takes ``target.file.name``, renames the
    report to ``<name>.json`` and then the folder itself to ``<name>``.
    """
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        try:
            folder_path = json_path + folder
            report = loadJson(folder_path + '/report.json')
            name = report['target']['file']['name']

            # Rename the report first, then the folder containing it.
            os.rename(folder_path + '/report.json',
                      folder_path + '/%s.json' % name)
            os.rename(folder_path, json_path + name)

        except Exception as e:
            reporter.logError(entity=folder, msg=str(e))
            continue

        else:
            reporter.logSuccess()

    reporter.report()
Esempio n. 9
0
def collectJsonFromExistingDataset(json_path, dst_path, is_class_dir=True):
    """Flatten all json files of a dataset into ``dst_path``.

    Copies every item found under ``json_path`` directly into ``dst_path``;
    an item whose name already exists at the destination is reported as a
    duplicate and not overwritten.
    """
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        items = (os.listdir(json_path + folder + '/')
                 if is_class_dir else [folder + '.json'])

        for item in items:
            if os.path.exists(dst_path + item):
                # Same-named file already collected: report, don't overwrite.
                reporter.logError(entity=folder + '/' + item,
                                  msg="Duplicate exists")
            else:
                shutil.copy(json_path + folder + '/' + item, dst_path + item)
                reporter.logSuccess()

    reporter.report()
Esempio n. 10
0
def collectJsonByClass(
    pe_path,
    json_path,
    dst_path,
    report_path,
    num_per_class,
    selected_classes,
):
    """Sample ``num_per_class`` json reports per selected class.

    Candidates come from the PE folder of each class, excluding items
    flagged as warnings/errors in the report at ``report_path`` and items
    that have no corresponding json folder. Sampled reports are copied into
    ``dst_path/<class>/``.
    """
    reporter = Reporter()

    warn_errs = loadJson(report_path)
    # Items flagged by the scale check are excluded from sampling.
    excluded = set(warn_errs['warnings']) | set(warn_errs['errors'])

    for cls in tqdm(selected_classes):
        dst_dir = dst_path + cls + '/'

        if not os.path.exists(dst_dir):
            os.mkdir(dst_dir)

        # Drop flagged items, then items whose json folder is missing.
        cand_items = [
            x for x in os.listdir(pe_path + cls + '/')
            if x not in excluded and os.path.exists(json_path + x + '/')
        ]

        for item in random.sample(cand_items, num_per_class):
            try:
                shutil.copy(json_path + item + '/%s.json' % item,
                            dst_dir + '/%s.json' % item)

            except Exception as e:
                reporter.logError('%s/%s' % (cls, item), str(e))

            else:
                reporter.logSuccess()

    reporter.report()
Esempio n. 11
0
def filterApiSequence(json_path,
                      api_list,
                      keep_or_filter=True,  # True: drop apis in the list; False: keep only apis in the list
                      is_class_dir=True):
    """Filter every report's api sequence against ``api_list``, in place.

    With ``keep_or_filter=True`` tokens found in ``api_list`` are removed;
    with ``False`` only tokens found in ``api_list`` are kept. The filtered
    sequence overwrites ``report['apis']``.
    """
    reporter = Reporter()

    # PERF FIX: membership is tested once per token per report; hoisting a
    # set makes each test O(1) instead of O(len(api_list)).
    api_set = set(api_list)

    for folder in tqdm(os.listdir(json_path)):
        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:

            item_path = json_path + folder + '/' + item

            try:
                report = loadJson(item_path)

                # XOR: keep tokens OUTSIDE the list when filtering,
                # tokens INSIDE the list when keeping.
                report['apis'] = [
                    token for token in report['apis']
                    if (token in api_set) ^ keep_or_filter
                ]

                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    reporter.report()
Esempio n. 12
0
def statNGram(
        parent_path,
        window=3,
        dict_save_path=None,  # where to dump the NGram frequency dict (JSON)
        frequency_stairs=(),  # ascending frequency thresholds; for each, print the minimum NGram count needed
        class_dir=False):
    """Count n-gram frequencies over all api sequences under ``parent_path``.

    Returns a dict mapping each n-gram string to its normalized frequency,
    sorted in descending order. Optionally dumps the dict to
    ``dict_save_path`` and prints, for every entry of ``frequency_stairs``,
    how many top n-grams are needed to reach that cumulative frequency.
    """
    reporter = Reporter()

    # Raw occurrence counts per ngram, plus the grand total for normalizing.
    ngram_counts = {}
    total_cnt = 0

    printState('Counting...')
    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                seq = loadJson(folder_path + item)['apis']

                # NOTE(review): range(len - window) skips the final window;
                # kept as-is for consistency with convertToNGramSeq.
                for i in range(len(seq) - window):
                    ngram = strlistToStr(seq[i:i + window])

                    total_cnt += 1
                    ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(entity=folder, msg=str(e))
                continue

    printState('Processing...')

    # Sort by count (descending) and normalize counts to frequencies in one
    # pass. total_cnt > 0 whenever ngram_counts is non-empty.
    ngram_dict = {
        k: v / total_cnt
        for k, v in sorted(
            ngram_counts.items(), key=lambda x: x[1], reverse=True)
    }

    if dict_save_path is not None:
        dumpJson(ngram_dict, dict_save_path)

    # For each frequency stair, report how many top NGrams are needed to
    # accumulate at least that much total frequency.
    f_accum = 0.
    idx = 0
    keys = list(ngram_dict.keys())
    max_len = len(keys)
    for f_stair in frequency_stairs:
        while f_accum < f_stair and idx < max_len:
            f_accum += ngram_dict[keys[idx]]
            idx += 1
        # NOTE(review): idx ngrams were accumulated, yet idx+1 is printed —
        # possibly intentional 1-indexing; confirm before changing.
        printBulletin('%f:   %d NGrams' % (f_stair, idx + 1))

    printBulletin('Total: %d NGrams' % len(ngram_dict))

    reporter.report()
    return ngram_dict
Esempio n. 13
0
def apiStat(
        path,
        least_length=10,  # minimum acceptable sequence length
        dump_report_path=None,  # where to dump warnings/errors (JSON)
        dump_apiset_path=None,  # where to dump the full api set (JSON)
        ratio_stairs=(),  # length thresholds for cumulative ratio stats
        class_dir=False,  # True: one folder per class; False: one folder per sample
        plot=False):
    """Gather statistics over api sequences: lengths and api vocabulary.

    Walks every json report under ``path``, recording sequence lengths and
    the set of distinct apis. Zero-length sequences are errors, sequences
    shorter than ``least_length`` are warnings. Prints min/max length and
    vocabulary size; for each entry of ``ratio_stairs`` prints the fraction
    of sequences shorter than it. Optionally plots a length histogram and
    dumps the report / api set to the given paths.
    """
    reporter = Reporter()

    api_set = set()   # distinct api names seen
    lengths = []      # per-sample sequence lengths

    # Running min/max of the sequence length.
    min_ = sys.maxsize
    max_ = -1

    for folder in tqdm(os.listdir(path)):

        if class_dir:
            # One class per folder: each file is addressed by its stem
            # (extension stripped).
            items = ['.'.join(x.split('.')[:-1])
                     for x in os.listdir(path + folder + '/')]
        else:
            # One sample per folder: the json shares the folder's name.
            items = [folder]

        for item in items:
            try:
                report = loadJson(path + folder + '/%s.json' % item)

                length = len(report['apis'])
                lengths.append(length)

                api_set.update(report['apis'])

                min_ = min(length, min_)
                max_ = max(length, max_)

                if length == 0:
                    reporter.logError(item, 'api length of 0')
                elif length < least_length:
                    reporter.logWarning(item, 'api length of %d' % length)
                else:
                    reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    printBulletin('Max Length: %d' % max_)
    printBulletin('Min Length: %d' % min_)
    printBulletin('API set(%d in total)' % len(api_set))

    reporter.report()

    lengths = np.array(lengths)

    for length_stair in ratio_stairs:
        ratio = (lengths < length_stair).sum() / len(lengths)
        printBulletin('Length within %d: %f' % (length_stair, ratio))

    if plot:
        # BUG FIX: `normed` was removed from matplotlib; `density` is the
        # supported equivalent.
        plt.hist(lengths, bins=1000, density=True, range=(0, 10000))
        plt.show()

    if dump_report_path is not None:
        reporter.dump(dump_report_path)

    if dump_apiset_path is not None:
        dumpIterable(api_set, 'api_set', dump_apiset_path)
Esempio n. 14
0
def removeRepeatedSubSeq(
        json_path,
        max_sub_seq_len=5,  # maximum length of repeated sub-sequences to detect
        is_class_dir=False):
    """Collapse immediately repeated sub-sequences in every report's api list.

    For each report under ``json_path``, scans the api sequence with an
    advancing anchor and, for pattern lengths 1..max_sub_seq_len, removes
    adjacent repetitions of the pattern starting at the anchor. The
    shortened sequence is written back over the original file.
    """

    ##############################################
    # Starting at anchor r_base_idx, remove adjacent
    # repetitions of the length-r_pat_len pattern
    # from r_seq. Returns (new_seq, removed_anything).
    ##############################################
    def removePattern(r_seq, r_base_idx, r_pat_len):
        candidate_pat = r_seq[r_base_idx:r_base_idx + r_pat_len]
        r_idx = r_base_idx + r_pat_len  # start matching right after the anchor pattern
        flag = False
        # NOTE(review): the strict `<` bound means a repetition ending
        # exactly at the end of the sequence is never tested — looks like
        # an off-by-one; confirm before changing.
        while r_idx + r_pat_len < len(r_seq):
            temp = r_seq[r_idx:r_idx + r_pat_len]
            if temp == candidate_pat:
                # Splice out the matched copy; r_idx stays put so the next
                # adjacent copy (now shifted into place) is tested next.
                r_seq = r_seq[:r_idx] + r_seq[r_idx + r_pat_len:]
                flag = True
            # No match: stop — only copies adjacent to the anchor count.
            else:
                break

        return r_seq, flag

    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):

        print(folder)

        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:

            item_path = json_path + folder + '/' + item

            try:
                report = loadJson(item_path)
                apis = report['apis']

                seq_index = 0  # anchor position in the (shrinking) sequence

                while seq_index < len(apis):
                    # print(seq_index)
                    for i in range(1, max_sub_seq_len + 1):
                        apis, flag_ = removePattern(apis, seq_index, i)
                        # Once a repetition was removed, restart pattern
                        # length detection from 1 at the next anchor.
                        if flag_:
                            break

                    # On a successful match, advance the anchor by the
                    # removed pattern's length...
                    if flag_:
                        seq_index += i
                    # ...otherwise advance by a single position.
                    else:
                        seq_index += 1

                # Replace the original api sequence with the de-duplicated one.
                report['apis'] = apis
                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()