import os
import sys
import random
import shutil

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Project helpers (the import path below is an assumption; adjust it to this
# repo's layout): loadJson/dumpJson/dumpIterable handle JSON I/O, Reporter
# aggregates success/warning/error logs, and the rest are small utilities
# used throughout this module.
from utils import (Reporter, loadJson, dumpJson, dumpIterable, magicSeed,
                   printState, printBulletin, strlistToStr, extractZipFile)


def renameItemsByMD5(json_path,   # directory of the JSON reports
                     item_path,   # directory of the data files
                     ext_name=''):
    reporter = Reporter()
    md5s = []

    for json_item in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path + json_item)
            md5 = report['md5']

            # skip duplicated MD5s: renaming a second item to the same
            # MD5-based name would collide with the first one
            if md5 in md5s:
                reporter.logWarning(entity=json_item, msg='duplicated MD5')
                continue
            md5s.append(md5)

            filename = '.'.join(json_item.split('.')[:-1])

            # rename the JSON report
            os.rename(json_path + json_item, json_path + md5 + '.json')
            # rename the data file
            os.rename(item_path + filename + ext_name,
                      item_path + md5 + ext_name)

            reporter.logSuccess()

        except Exception as e:
            reporter.logError(entity=json_item, msg=str(e))

    reporter.report()

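# Usage sketch (the paths are hypothetical; both must end with '/' because
# the function concatenates them directly):
#
#   renameItemsByMD5(json_path='data/reports/',
#                    item_path='data/binaries/',
#                    ext_name='')   # e.g. '.exe' if the data files carry one
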
def extractApiFromJson(path):
    reporter = Reporter()

    for i, item_dir in enumerate(os.listdir(path)):
        print(i, item_dir)

        # assumes the JSON file shares its name with the enclosing folder
        cur_json_path = path + item_dir + '/%s.json' % item_dir

        new_report = {}
        new_report['apis'] = []

        try:
            report = loadJson(cur_json_path)

            # compatible with both processed and raw reports
            if 'target' in report:
                new_report['name'] = report['target']['file']['name']
            else:
                new_report['name'] = report['name']

            # newer reports already carry an 'apis' field
            if 'apis' in report:
                new_report['apis'] = report['apis']
            # in full reports the APIs live under behavior->processes->calls->api
            else:
                # collect API call names process by process, call by call
                api_call_seq = []
                for process in report['behavior']['processes']:
                    for call in process['calls']:
                        api_call_seq.append(call['api'])
                new_report['apis'] = api_call_seq

            # persist the distilled report back to the same file
            dumpJson(new_report, cur_json_path)
            reporter.logSuccess()

        # a KeyError means the source file is malformed and the APIs
        # should be left empty
        except KeyError as e:
            # if the name was saved, keep it with an empty API list
            if 'name' in new_report:
                new_report['apis'] = []
                dumpJson(new_report, cur_json_path)
            # otherwise leave the file untouched
            reporter.logError(item_dir, str(e))

        # other errors are not handled
        except Exception as e:
            reporter.logError(item_dir, str(e))

    reporter.report()

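# Minimal illustration of the two report shapes handled above (toy values,
# not a real Cuckoo report):
#
#   raw = {'target': {'file': {'name': 'sample.exe'}},
#          'behavior': {'processes': [
#              {'calls': [{'api': 'NtCreateFile'}, {'api': 'NtClose'}]}]}}
#   # -> {'name': 'sample.exe', 'apis': ['NtCreateFile', 'NtClose']}
#
#   processed = {'name': 'sample.exe', 'apis': ['NtCreateFile', 'NtClose']}
#   # -> copied through unchanged
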
def mappingApiNormalize(json_path,
                        mapping,
                        dump_mapping_path=None,
                        is_class_dir=False):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        items = os.listdir(json_path + folder + '/') if is_class_dir \
            else [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                # replace every API token that appears in the mapping
                for i in range(len(report['apis'])):
                    if report['apis'][i] in mapping:
                        report['apis'][i] = mapping[report['apis'][i]]

                dumpJson(report, item_path)
                reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    if dump_mapping_path is not None:
        dumpJson(mapping, dump_mapping_path)

    reporter.report()

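# Usage sketch (the mapping below is hypothetical; it merges API aliases
# into one canonical token):
#
#   mappingApiNormalize('data/jsons/',
#                       mapping={'NtOpenFile': 'OpenFile',
#                                'OpenFileW': 'OpenFile'},
#                       is_class_dir=True)
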
def parseAndSampleDataset(scale_report_path,
                          base_path,
                          dst_path,
                          num_per_class,
                          checked=True):
    scale_report = loadJson(scale_report_path)

    for family_name in tqdm(scale_report):
        # sample only the classes large enough for the requested scale
        if len(scale_report[family_name]) >= num_per_class:
            random.seed(magicSeed())
            candidates = random.sample(scale_report[family_name],
                                       num_per_class)

            if os.path.exists(dst_path + family_name + '/'):
                raise RuntimeError(
                    "Folder of class %s already exists in the destination!"
                    % family_name)
            else:
                os.mkdir(dst_path + family_name + '/')

            for item in candidates:
                folder_name, item_name = item.split("/")
                full_item_name = item_name + '.' + folder_name
                shutil.copy(base_path + item,
                            dst_path + family_name + '/' + full_item_name)

    if checked:
        reporter = Reporter()
        for folder in os.listdir(dst_path):
            if len(os.listdir(dst_path + folder + '/')) != num_per_class:
                reporter.logError(
                    entity=folder,
                    msg="fewer items than expected: %d/%d" %
                    (len(os.listdir(dst_path + folder + '/')), num_per_class))
            else:
                reporter.logSuccess()
        reporter.report()

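# The scale report is expected to map each family to a list of
# 'folder/item' relative paths (hypothetical values below):
#
#   {'familyA': ['0a1b/report1', '3c4d/report2'],
#    'familyB': ['5e6f/report3']}
#
#   parseAndSampleDataset('scale.json', 'data/all/', 'data/sampled/',
#                         num_per_class=20)
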
def removeApiRedundance(json_path, selected_apis=None, class_dir=True):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        if class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                redun_flag = False
                redun_api_token = None
                new_api_seq = []

                for api_token in report['apis']:
                    # keep only the selected APIs;
                    # selected_apis=None means no selection at all
                    if selected_apis is None or api_token in selected_apis:
                        if api_token != redun_api_token:
                            # a new API: remember it and reset the flag
                            redun_api_token = api_token
                            redun_flag = False
                        else:
                            if not redun_flag:
                                # same API seen for the second time: set the flag
                                redun_flag = True
                            else:
                                # same API with the flag already set: it has been
                                # seen twice, skip the redundant occurrence
                                continue

                        new_api_seq.append(api_token)

                # overwrite the original API sequence with the new one
                report['apis'] = new_api_seq
                dumpJson(report, item_path)
                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()

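# The loop above caps every run of identical consecutive APIs at two
# occurrences. A minimal standalone sketch of the same rule (hypothetical
# helper, not part of this module's pipeline):
#
#   def capConsecutive(seq, max_run=2):
#       out = []
#       for tok in seq:
#           if out[-max_run:] != [tok] * max_run:
#               out.append(tok)
#       return out
#
#   capConsecutive(['a', 'a', 'a', 'b', 'a'])  # -> ['a', 'a', 'b', 'a']
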
def extractAllZipFile(src_dir, dst_dir, psw):
    reporter = Reporter()

    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)

    for item in tqdm(os.listdir(src_dir)):
        try:
            extractZipFile(src=src_dir + item, dst=dst_dir, psw=psw)
            reporter.logSuccess()
        except RuntimeError as e:
            reporter.logError(entity=item, msg=str(e))

    reporter.report()

def convertToNGramSeq(parent_path,
                      window=3,
                      ngram_dict=None,     # N-gram frequency dict, sorted descending
                      ngram_max_num=None,  # keep only the top-n N-grams; obtainable
                                           # from the stat function, or leave unset
                      class_dir=False):
    reporter = Reporter()

    if ngram_dict is not None and ngram_max_num is not None:
        # a set makes the membership tests below O(1)
        valid_ngrams = set(list(ngram_dict.keys())[:ngram_max_num])
    else:
        valid_ngrams = None

    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                ngram_seq = []

                report = loadJson(folder_path + item)
                api_seq = report['apis']

                # slide a window over the sequence, including the final one
                for i in range(len(api_seq) - window + 1):
                    ngram = strlistToStr(api_seq[i:i + window])
                    # keep the N-gram only when no selection is given or
                    # it belongs to the selected N-grams
                    if valid_ngrams is None or ngram in valid_ngrams:
                        ngram_seq.append(ngram)

                # write back to the original file
                report['apis'] = ngram_seq
                dumpJson(report, folder_path + item)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(entity=folder + '/' + item, msg=str(e))
                continue

    reporter.report()

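# Toy illustration of the windowing (assuming strlistToStr simply joins
# the tokens into one string; its actual separator lives in this repo's
# utils):
#
#   api_seq = ['A', 'B', 'C', 'D']   # hypothetical tokens
#   # window=3 yields the 3-grams ['A','B','C'] and ['B','C','D'],
#   # each flattened to a single string by strlistToStr.
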
def renameCuckooFolders(json_path):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path + folder + '/report.json')
            name = report['target']['file']['name']

            os.rename(json_path + folder + '/report.json',
                      json_path + folder + '/%s.json' % name)
            os.rename(json_path + folder, json_path + name)

            reporter.logSuccess()
        except Exception as e:
            reporter.logError(entity=folder, msg=str(e))
            continue

    reporter.report()

def collectJsonFromExistingDataset(json_path, dst_path, is_class_dir=True):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            if not os.path.exists(dst_path + item):
                shutil.copy(json_path + folder + '/' + item, dst_path + item)
                reporter.logSuccess()
            else:
                reporter.logError(entity=folder + '/' + item,
                                  msg="Duplicate exists")

    reporter.report()

def collectJsonByClass(
        pe_path,
        json_path,
        dst_path,
        report_path,
        num_per_class,
        selected_classes,
):
    reporter = Reporter()

    warn_errs = loadJson(report_path)

    def length_filter(x):
        return x not in warn_errs['warnings'] and x not in warn_errs['errors']

    for cls in tqdm(selected_classes):
        dst_dir = dst_path + cls + '/'
        if not os.path.exists(dst_dir):
            os.mkdir(dst_dir)

        # filter out the items that do not satisfy the scale requirement
        cand_items = os.listdir(pe_path + cls + '/')
        cand_items = list(filter(length_filter, cand_items))

        # some PE items miss their corresponding JSON item
        cand_items = list(
            filter(lambda x: os.path.exists(json_path + x + '/'), cand_items))
        cand_items = random.sample(cand_items, num_per_class)

        for item in cand_items:
            try:
                shutil.copy(json_path + item + '/%s.json' % item,
                            dst_dir + '%s.json' % item)
                reporter.logSuccess()
            except Exception as e:
                reporter.logError('%s/%s' % (cls, item), str(e))

    reporter.report()

def filterApiSequence(json_path,
                      api_list,
                      keep_or_filter=True,  # True: drop the listed APIs;
                                            # False: keep only the listed APIs
                      is_class_dir=True):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                new_api_seq = []

                for api_token in report['apis']:
                    # when filtering, keep the APIs absent from the list;
                    # when keeping, keep the APIs present in the list
                    if (api_token in api_list) ^ keep_or_filter:
                        new_api_seq.append(api_token)

                # overwrite the original API sequence with the new one
                report['apis'] = new_api_seq
                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    reporter.report()

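# The XOR condenses both modes into a single test:
#
#   keep_or_filter=True  (filter): (tok in api_list) ^ True  -> keep tok
#                                  only when it is NOT in api_list
#   keep_or_filter=False (keep):   (tok in api_list) ^ False -> keep tok
#                                  only when it IS in api_list
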
def statNGram(parent_path,
              window=3,
              dict_save_path=None,   # where to save the N-gram frequencies
              frequency_stairs=[],   # frequency stairs, ascending; reports the
                                     # minimum number of N-grams covering each
              class_dir=False):
    reporter = Reporter()
    ngram_dict = {}
    total_cnt = 0

    printState('Counting...')
    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                seq = loadJson(folder_path + item)['apis']
                # slide a window over the sequence, including the final one
                for i in range(len(seq) - window + 1):
                    ngram = strlistToStr(seq[i:i + window])
                    total_cnt += 1
                    if ngram not in ngram_dict:
                        ngram_dict[ngram] = 1
                    else:
                        ngram_dict[ngram] += 1

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(entity=folder, msg=str(e))
                continue

    printState('Processing...')
    # sort by frequency in descending order
    ngram_dict = dict(
        sorted(ngram_dict.items(), key=lambda x: x[1], reverse=True))

    # normalize the frequencies
    for k in ngram_dict.keys():
        ngram_dict[k] = ngram_dict[k] / total_cnt

    if dict_save_path is not None:
        dumpJson(ngram_dict, dict_save_path)

    # report the cumulative frequency distribution;
    # idx counts the N-grams consumed so far
    f_accum = 0.
    idx = 0
    keys = list(ngram_dict.keys())
    max_len = len(keys)
    for f_stair in frequency_stairs:
        while f_accum < f_stair and idx < max_len:
            f_accum += ngram_dict[keys[idx]]
            idx += 1
        printBulletin('%f: %d NGrams' % (f_stair, idx))

    printBulletin('Total: %d NGrams' % len(ngram_dict))
    reporter.report()

    return ngram_dict

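# Sketch of how the stairs read (hypothetical numbers): with normalized
# frequencies [0.5, 0.3, 0.1, ...], frequency_stairs=[0.8, 0.9] reports
# that 2 N-grams cover 0.8 and 3 N-grams cover 0.9.
#
#   ngram_dict = statNGram('data/jsons/', window=3,
#                          frequency_stairs=[0.8, 0.9])
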
def apiStat(path,
            least_length=10,        # minimum sequence length
            dump_report_path=None,  # where to save the warning/error report, JSON
            dump_apiset_path=None,  # where to save the set of all APIs, JSON
            ratio_stairs=[],        # length stairs for the ratio statistics
            class_dir=False,        # whether a folder holds all samples of one
                                    # class or just a single sample
            plot=False):
    reporter = Reporter()
    # the set of distinct APIs
    api_set = set()
    # all sequence lengths
    lengths = []
    # running minimum and maximum of the lengths
    min_ = sys.maxsize
    max_ = -1

    for folder in tqdm(os.listdir(path)):
        if class_dir:
            items = os.listdir(path + folder + '/')
            # take each file's stem, dropping its extension
            items = list(map(lambda x: '.'.join(x.split('.')[:-1]), items))
        else:
            # one sample per folder: the file shares the folder's name
            items = [folder]

        for item in items:
            try:
                # assumes the JSON file shares its name with the folder
                report = loadJson(path + folder + '/%s.json' % item)

                length = len(report['apis'])
                lengths.append(length)

                for api in report['apis']:
                    api_set.add(api)

                # update the running min and max
                min_ = min(length, min_)
                max_ = max(length, max_)

                if length == 0:
                    reporter.logError(item, 'api length of 0')
                elif length < least_length:
                    reporter.logWarning(item, 'api length of %d' % length)
                else:
                    reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    printBulletin('Max Length: %d' % max_)
    printBulletin('Min Length: %d' % min_)
    printBulletin('API set(%d in total)' % len(api_set))

    reporter.report()

    lengths = np.array(lengths)
    for length_stair in ratio_stairs:
        ratio = (lengths < length_stair).sum() / len(lengths)
        printBulletin('Length within %d: %f' % (length_stair, ratio))

    if plot:
        plt.hist(lengths, bins=1000, density=True, range=(0, 10000))
        plt.show()

    if dump_report_path is not None:
        reporter.dump(dump_report_path)

    if dump_apiset_path is not None:
        dumpIterable(api_set, 'api_set', dump_apiset_path)

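# Usage sketch (hypothetical paths and stairs): report how many sequences
# fall under each length while collecting the API vocabulary on the side.
#
#   apiStat('data/jsons/', least_length=10,
#           dump_apiset_path='api_set.json',
#           ratio_stairs=[100, 500, 1000], class_dir=True)
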
def removeRepeatedSubSeq(json_path,
                         max_sub_seq_len=5,  # maximum length of the repeated
                                             # subsequences to detect
                         is_class_dir=False):
    ##################################################
    # Starting from the anchor r_base_idx, remove the
    # consecutive repetitions of the length-r_pat_len
    # subsequence from r_seq.
    ##################################################
    def removePattern(r_seq, r_base_idx, r_pat_len):
        candidate_pat = r_seq[r_base_idx:r_base_idx + r_pat_len]
        # start checking from the subsequence right after the anchor
        r_idx = r_base_idx + r_pat_len
        flag = False

        while r_idx + r_pat_len <= len(r_seq):
            temp = r_seq[r_idx:r_idx + r_pat_len]
            if temp == candidate_pat:
                # remove the matched subsequence; r_idx stays put because
                # the remainder shifts left
                r_seq = r_seq[:r_idx] + r_seq[r_idx + r_pat_len:]
                flag = True
            # stop at the first position that does not match
            else:
                break

        return r_seq, flag

    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        print(folder)

        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)
                apis = report['apis']

                seq_index = 0
                while seq_index < len(apis):
                    for i in range(1, max_sub_seq_len + 1):
                        apis, flag_ = removePattern(apis, seq_index, i)
                        # once a repetition is removed, restart the pattern
                        # length from 1
                        if flag_:
                            break

                    # on a match, advance the anchor by the removed pattern's
                    # length; otherwise move forward a single position
                    if flag_:
                        seq_index += i
                    else:
                        seq_index += 1

                # overwrite the original API sequence with the new one
                report['apis'] = apis
                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()

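# Toy walk-through of removePattern (hypothetical tokens): with
# apis = ['a', 'b', 'a', 'b', 'a', 'b', 'c'], anchor 0 and pattern
# length 2, the candidate ['a', 'b'] matches twice more in a row, so
# both copies are removed and the sequence collapses to ['a', 'b', 'c'].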