def cvedetails_crawler():
    """Crawl exploit labels from cvedetails.com for CVEs not yet labelled locally.

    Returns:
        exp_add: dict of tuples keyed by CVE id,
            e.g. {cid: (cid, label, 'cvedetails')}
    """
    create_table(db='exp', table='cvedetails', key=['cve_id', 'label'],
                 primary_key='cve_id')
    so, exist_cid = cve_exists(db='exp', table='cvedetails', key=['cve_id'])
    so1, all_cid = cve_exists(db='cve', table='nvd',
                              key=['CVE_Items_cve_CVE_data_meta_ID'])
    # Only parse CVEs that have no local label yet.
    add_cid = list(set(all_cid).difference(set(exist_cid)))
    cve_add = dict()
    exp_add = dict()
    for cid in add_cid:
        label = cvedetails_parser(cid)
        # NOTE(review): only positive labels are persisted, so a cid with
        # label == 0 is re-parsed on every run — confirm this is intended.
        if label == 1:
            cve_add[cid] = (cid, label)
            exp_add[cid] = (cid, label, 'cvedetails')
            print((cid, label))
    # Skip the DB round-trip when nothing new was found.
    if cve_add:
        sql = 'replace INTO cvedetails (cve_id, label) VALUES (?, ?)'
        so.executemany(sql, cve_add.values())
    print(
        '[+] Searched exp added from remote cve details, it will costs several minute'
    )
    return exp_add
def github_exp_all():
    """Return every CVE id already stored in the local github exp table.

    :return exist_cid: list — CVE ids present in sqlite3, originally sourced
        from github.com
    """
    connection, cached_ids = cve_exists(db='exp', table='github',
                                        key=['cve_id'])
    print('[+] Searched all exp from local github db')
    return cached_ids
def seebug_exp_add():
    """Crawl seebug SSV pages and return the incremental CVE exp data.

    Returns:
        seebug_cve_exp_add: dict of tuples — entries newly labelled with exp
        data on seebug.org, keyed by ssv id, e.g.
        {'SSV-18407': ('SSV-18407', '2002-05-19',
        'psyBNC <= 2.3 Denial of Service Exploit', 'CVE-2002-0741', 1)}
    """
    create_table(db='exp', table='seebug',
                 key=['ssv', 'date', 'title', 'cid', 'label'],
                 primary_key='ssv')
    so, exist_sid = cve_exists(db='exp', table='seebug', key=['ssv'])
    max_page, headers = seebug_max_page()
    # Compare local data against the remote pages, newest first.
    seebug_ssv_add = dict()
    seebug_cve_exp_add = dict()
    for page in range(1, max_page):
        print("[+] Crawling seebug vul page:%d" % page)
        et = 0  # entries on this page that already exist locally
        vul_url = 'https://www.seebug.org/vuldb/vulnerabilities?page={}'.format(
            str(page))
        ssv_page, exp_page = seebug_page_parser(url=vul_url, headers=headers)
        # Drop entries already present locally, counting duplicates in `et`.
        ssv = list(ssv_page.values())
        for i in ssv:
            if i[0] in exist_sid:
                et = et + 1
                del ssv_page[i[0]]
                # BUGFIX: membership must be tested against exp_page's KEYS
                # (ssv ids); the original `i[0] in list(exp_page.values())`
                # compared an id to tuple values and could never match, so
                # already-known exp rows were re-reported every run.
                if i[0] in exp_page:
                    del exp_page[i[0]]
        # Fewer than 20 duplicates means the page still holds new rows.
        if et < 20:
            seebug_ssv_add = {**seebug_ssv_add, **ssv_page}
        seebug_cve_exp_add = {**seebug_cve_exp_add, **exp_page}
        # Any duplicate means we have reached already-crawled territory.
        if et > 0:
            break
    if seebug_ssv_add:
        sql = 'replace INTO seebug (ssv,date,title,cid,label) VALUES (?, ?, ?, ?, ?)'
        so.executemany(sql, seebug_ssv_add.values())
    else:
        print("[!] Seebug updated not found")
    print('[+] Searched exp added from remote seebug.org')
    print('[+] Add ssv:%s' % seebug_cve_exp_add)
    return seebug_cve_exp_add
def expdb_exp_add(reparse=False):
    """Extract CVE exp labels from exploit-db.com to complement other sources.

    Update strategy: incremental upsert (sqlite `replace`).

    Args:
        reparse: when True, diff against the full remote edb id list instead
            of the RSS feed of recent entries.

    Returns:
        cves: dict of tuples — exploit-db entries carrying a CVE number, e.g.
        {'48587': ('48587', '2020-12712', 'Sander Ubink', 'remote',
        'Multiple', '2020-06-15')}
    """
    create_table(
        db='exp', table='exploitdb',
        key=['edb_id', 'cve_id', 'author', 'type', 'platform', 'date'],
        primary_key='edb_id')
    so, exist_eid = cve_exists(db='exp', table='exploitdb', key=['edb_id'])
    if reparse:
        all_eid = expdb_exists()
        all_eid = [str(i) for i in all_eid]
        add_eid = list(set(all_eid).difference(set(exist_eid)))
    else:
        rss_add = rss_xml()
        rss_eid = list(rss_add.keys())
        add_eid = list(set(rss_eid).difference(set(exist_eid)))
    print("[+] Add eid:%s" % add_eid)
    exps = dict()
    cves = dict()
    for eid in add_eid:
        try:
            exp_value, cve_value = expdb_parser(eid)
            if exp_value:
                exps[eid] = tuple(exp_value.values())
            if cve_value:
                cves[eid] = tuple(cve_value.values())
        except Exception as e:
            # Best-effort: one failed download must not abort the batch.
            print("[!] DOWNLOAD ERROR %s error:%s" % (eid, repr(e)))
    if exps:
        sql = 'replace INTO exploitdb (edb_id, cve_id, author, type, platform, date) VALUES (?, ?, ?, ?, ?, ?)'
        so.executemany(sql, exps.values())
    else:
        print("[!] exploit-db updated not found")
    # BUGFIX: close the connection on every path — the original only closed
    # it when rows were inserted, leaking the handle otherwise.
    so.close()
    print('[+] Searched exp added from remote exploit-db')
    return cves
def github_exp_add(per_page=100):
    """Search github.com by configured keyword(s) for CVE ids with exp markers.

    Update strategy: supports partial-stock and incremental updates.

    :param per_page: results per API page
    :return cve_add: dict of tuples — CVE ids newly found on github.com
    """
    # Diff local records against the remote result set.
    create_table(db='exp', table='github',
                 key=['cve_id', 'exp_publishedtime', 'label'],
                 primary_key='cve_id')
    # sqlite3 (rather than json) was chosen for easier extension and querying.
    so, exist_cid = cve_exists(db='exp', table='github', key=['cve_id'])
    conf = configparser.ConfigParser()
    conf.read('conf/info.conf')
    local_count = conf.get('CVE_Label', 'total_count')
    key_name = conf.get('CVE_Label', 'search_key')
    api_token = conf.get('CVE_Label', 'api_token')
    gh = GitHub(api_token=api_token)
    total_count = max_total_count(gh=gh, kname=key_name, per_page=per_page)
    cve_add = dict()
    if not total_count:
        print("[Error] Failed to get github total count")
    else:
        print("[+] Checked, Got github res total count:%s" % total_count)
        add_count = total_count - int(
            local_count)  # todo: if add_count<per_page
        if add_count > 1000:
            add_count = 1000
        print("[+] local res:%s,total res:%s,add res:%s" %
              (local_count, total_count, add_count))
        # Fetch the missing pages one by one.
        page_count = math.ceil(add_count / per_page)
        if page_count > 0:
            for page in range(1, page_count + 1):
                ok, page_hits = single_request(gh=gh, kname=key_name,
                                               per_page=per_page, page=page)
                if ok and page_hits:
                    # Iterate a snapshot so deletion during the loop is safe.
                    for row in list(page_hits.values()):
                        if row[0] in exist_cid:
                            del page_hits[row[0]]
                else:
                    print("[!] Error in call_to_the_api")
                if page_hits:
                    cve_add.update(page_hits)
                else:
                    print("[!] Github updated not found ")
        # Persist new rows and remember the remote total for the next run.
        if page_count > 0:
            sql = 'replace INTO github (cve_id,exp_publishedtime,label) VALUES (?, ?, ?)'
            so.executemany(sql, cve_add.values())
            conf.set('CVE_Label', 'total_count', str(total_count))
            with open('conf/info.conf', 'w') as configfile:
                conf.write(configfile)
    print('[+] Searched exp added from remote github.com')
    return cve_add
def exp_model(delta=0):
    """Train the exp prediction model and score today's / this month's CVEs.

    NOTE(review): this definition is shadowed by the later
    ``exp_model(epoch, delta=0)`` in the same module, so callers resolve to
    that one — confirm whether this version can be removed.

    :param delta: day offset passed to time_delta for "today's" CVEs
    :return exp_proba: list of [CVE_ID, Description, PubDate, EXP_Proba]
        rows for CVEs published on the delta day, or [] when none exist
    :return exp_proba2: same shape for the current month plus a Ground_Truth
        column, or [] when none exist
    """
    # BUGFIX: the original `exp_proba = exp_proba2 = list()` aliased both
    # names to one shared list object; use two independent defaults.
    exp_proba = []
    exp_proba2 = []
    # Train on the full local NVD table.
    so, cve = cve_query(db='cve', table='nvd', key=['*'])
    cve_df = pd.DataFrame(cve, columns=cve_tags)
    x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
        'str')
    y = cve_df['CVE_EXP_label'].astype('int')
    nlp = wordindex(char_level=False)
    fx, fy = nlp.fit_transform(x, y)
    train_x, valid_x, train_y, valid_y = train_test_split(fx, fy,
                                                          random_state=2019,
                                                          test_size=0.3)
    model = textcnn(input_type='wordindex', max_len=nlp.max_length,
                    input_dim=nlp.input_dim, output_dim=16, class_num=1)
    model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=1,
              batch_size=128)
    # Predict: CVEs published on the requested day.
    modified_time = time_delta(delta=delta, format="%Y-%m-%d")
    so, cve = cve_query_where(
        db='cve', table='nvd', key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(modified_time))
    if cve:
        cve_df = pd.DataFrame(cve, columns=cve_tags)
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        fx = nlp.transform(x)
        model.summary()
        pre = pd.DataFrame(model.predict(fx))
        exp_proba = pd.concat([
            cve_df[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ], axis=1)
        exp_proba.columns = ['CVE_ID', 'Description', 'PubDate', 'EXP_Proba']
        exp_proba = exp_proba.sort_values(by='EXP_Proba', ascending=False)
        exp_proba = exp_proba.values.tolist()
    else:
        print('[INFO] No CVE Today')
    # Predict: CVEs published in the current month.
    # Renamed local from `time` to avoid shadowing the stdlib module name.
    month = time_delta(format="%Y-%m")
    so, cve = cve_query_where(
        db='cve', table='nvd', key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(month))
    if cve:
        cve_df = pd.DataFrame(cve, columns=cve_tags)
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        fx = nlp.transform(x)
        pre = pd.DataFrame(model.predict(fx))
        exp_proba2 = pd.concat([
            cve_df[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ], axis=1)
        exp_proba2.columns = ['CVE_ID', 'Description', 'PubDate', 'EXP_Proba']
        exp_proba2 = exp_proba2.sort_values(by='EXP_Proba', ascending=False)
        # Mark CVEs whose exp already exists locally as ground truth.
        so, exist_cid = cve_exists(db='exp', table='exps', key=['cve_id'])
        exp_proba2['Ground_Truth'] = exp_proba2.apply(
            lambda r: 1 if r['CVE_ID'] in exist_cid else 'None', axis=1)
        exp_proba2 = exp_proba2.values.tolist()
    else:
        print('[INFO] No CVE Month')
    return exp_proba, exp_proba2
def exp_model(epoch, delta=0):
    """Train/load the exp model and score epoch, daily, and monthly CVE sets.

    :param epoch: dict of CVEs added in the current crawl epoch; values are
        (id, description, publishedDate, lastModifiedDate, source) tuples
    :param delta: day offset(s) for the daily prediction window — an int or
        an iterable of ints
    :return epoch_exp_proba, day_exp_proba, month_exp_proba: sorted lists of
        [CVE_ID, Description, PubDate, EXP_Proba(, Ground_Truth)] rows, each
        None when its input set is empty
    """
    # BUGFIX: the body iterates `delta`, but the default was the int 0 which
    # raised TypeError whenever the default was used — wrap scalars. (Also
    # dropped the unused `exp_proba = exp_proba2 = list()` aliases.)
    if isinstance(delta, int):
        delta = [delta]
    # Export the full labelled corpus for offline analysis.
    so, cve = cve_query(db='cve', table='nvd', key=['*'])
    cve_df = pd.DataFrame(cve, columns=cve_tags)
    cve_csv = cve_df[[
        'CVE_Items_cve_CVE_data_meta_ID',
        'CVE_Items_cve_description_description_data_value',
        'CVE_Items_publishedDate', 'CVE_EXP_label'
    ]]
    cve_csv.to_csv('CVE_EXP_2020.csv', index=False)
    # Counter model decay: retrain monthly. The `== 404` comparison
    # deliberately disables retraining (the ECS host lacks memory); the
    # pre-trained models are loaded below instead.
    month_day = time_delta(delta=0, format='%Y-%m-%d')
    if int(month_day.split('-')[2]) == 404:
        print('[+] Retrain model')
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        y = cve_df['CVE_EXP_label'].astype('int')
        nlp = wordindex(char_level=False)
        fx, fy = nlp.fit_transform(x, y)
        joblib.dump(nlp, 'data/model/nlp.h5')
        train_x, valid_x, train_y, valid_y = train_test_split(
            fx, fy, random_state=2019, test_size=0.3)
        model = textcnn(input_type='wordindex', max_len=nlp.max_length,
                        input_dim=nlp.input_dim, output_dim=16, class_num=1)
        model.fit(train_x, train_y, validation_data=(valid_x, valid_y),
                  epochs=1, batch_size=128)
        joblib.dump(model, 'data/model/textcnn.h5')
    print('[+] Load predict model')
    nlp = joblib.load('data/model/nlp.h5')
    model = joblib.load('data/model/textcnn.h5')
    # Predict: CVEs added in the current crawl epoch.
    if epoch:
        cve_epoch = pd.DataFrame(
            list(epoch.values()),
            columns=[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate', 'CVE_Items_lastModifiedDate',
                'Source'
            ])
        print(cve_epoch)
        x = cve_epoch[
            'CVE_Items_cve_description_description_data_value'].astype('str')
        fx = nlp.transform(x)
        pre = pd.DataFrame(model.predict(fx))
        epoch_exp_proba = pd.concat([
            cve_epoch[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ], axis=1)
        epoch_exp_proba.columns = [
            'CVE_ID', 'Description', 'PubDate', 'EXP_Proba'
        ]
        epoch_exp_proba = epoch_exp_proba.sort_values(by='EXP_Proba',
                                                      ascending=False)
        epoch_exp_proba = epoch_exp_proba.values.tolist()
    else:
        epoch_exp_proba = None
        print('[INFO] No CVE Today Epoch')
    # Predict: CVEs published on each requested day offset.
    cve = []
    for d in delta:
        modified_time = time_delta(delta=d, format="%Y-%m-%d")
        so, tmp = cve_query_where(
            db='cve', table='nvd', key=['*'],
            where='CVE_Items_publishedDate like "%{}%"'.format(modified_time))
        cve = cve + tmp
    if cve:
        cve_df = pd.DataFrame(cve, columns=cve_tags)
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        fx = nlp.transform(x)
        pre = pd.DataFrame(model.predict(fx))
        day_exp_proba = pd.concat([
            cve_df[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ], axis=1)
        day_exp_proba.columns = [
            'CVE_ID', 'Description', 'PubDate', 'EXP_Proba'
        ]
        day_exp_proba = day_exp_proba.sort_values(by='EXP_Proba',
                                                  ascending=False)
        day_exp_proba = day_exp_proba.values.tolist()
    else:
        day_exp_proba = None
        print('[INFO] No CVE Today')
    # Predict: CVEs published this month, annotated with ground truth.
    month = time_delta(format="%Y-%m")
    so, cve = cve_query_where(
        db='cve', table='nvd', key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(month))
    if cve:
        cve_df = pd.DataFrame(cve, columns=cve_tags)
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        fx = nlp.transform(x)
        pre = pd.DataFrame(model.predict(fx))
        month_exp_proba = pd.concat([
            cve_df[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ], axis=1)
        month_exp_proba.columns = [
            'CVE_ID', 'Description', 'PubDate', 'EXP_Proba'
        ]
        month_exp_proba = month_exp_proba.sort_values(by='EXP_Proba',
                                                      ascending=False)
        # Mark CVEs whose exp already exists locally as ground truth.
        so, exist_cid = cve_exists(db='exp', table='exps', key=['cve_id'])
        month_exp_proba['Ground_Truth'] = month_exp_proba.apply(
            lambda r: 1 if r['CVE_ID'] in exist_cid else 'None', axis=1)
        month_exp_proba = month_exp_proba.values.tolist()
    else:
        month_exp_proba = None
        print('[INFO] No CVE Month')
    with open(path('../data/log', 'cveflow.log'), 'a+') as f:
        f.write('[Done] CVE EXP Prediction')
    return epoch_exp_proba, day_exp_proba, month_exp_proba