Beispiel #1
0
def cvedetails_crawler():
    """
    Crawl exp (exploit) labels from cvedetails.com.

    CVE ids are read from the ``nvd`` table of the ``cve`` database; those not
    yet present in the ``cvedetails`` table of the ``exp`` database are passed
    to ``cvedetails_parser`` one by one. Every CVE whose label equals 1 is
    written to the ``cvedetails`` table and included in the result.

    :return exp_add: dict keyed by CVE id, each value being the tuple
        ``(cid, label, 'cvedetails')``
    """
    create_table(db='exp',
                 table='cvedetails',
                 key=['cve_id', 'label'],
                 primary_key='cve_id')
    so, stored_ids = cve_exists(db='exp', table='cvedetails', key=['cve_id'])
    # Only the id list of the second lookup is needed; its handle is unused.
    _, nvd_ids = cve_exists(db='cve',
                            table='nvd',
                            key=['CVE_Items_cve_CVE_data_meta_ID'])
    pending_ids = list(set(nvd_ids).difference(set(stored_ids)))

    labelled_rows = {}
    exp_add = {}
    for cve_id in pending_ids:
        label = cvedetails_parser(cve_id)
        if label == 1:
            labelled_rows[cve_id] = (cve_id, label)
            exp_add[cve_id] = (cve_id, label, 'cvedetails')
        # Progress output: one line per crawled CVE, whatever its label.
        print((cve_id, label))

    so.executemany('replace INTO cvedetails (cve_id, label) VALUES (?, ?)',
                   labelled_rows.values())

    print(
        '[+] Searched exp added from remote cve details, it will costs several minute'
    )
    return exp_add
Beispiel #2
0
def github_exp_all():
    """
    Return the CVE ids already stored locally for github.com.

    :return exist_cid: the id collection returned by ``cve_exists`` for the
        ``cve_id`` column of the ``github`` table in the ``exp`` database
    """
    # The handle returned alongside the ids is not needed here.
    _, stored_cids = cve_exists(db='exp', table='github', key=['cve_id'])
    print('[+] Searched all exp from local github db')
    return stored_cids
Beispiel #3
0
def seebug_exp_add():
    """
    Crawl the seebug.org vulnerability listing and return the incremental
    CVE exp data.

    Pages are fetched in ascending order starting at page 1. Records whose SSV
    id is already stored in the local ``seebug`` table are removed from both
    the per-page SSV data and the per-page exp data, and crawling stops after
    the first page that contains at least one already-stored id.

    :return seebug_cve_exp_add: tuple_dict, cve added with exp data in
        seebug.org, e.g. {'SSV-18407': ('SSV-18407', '2002-05-19',
        'psyBNC <= 2.3 Denial of Service Exploit', 'CVE-2002-0741', 1)}
    """
    create_table(db='exp',
                 table='seebug',
                 key=['ssv', 'date', 'title', 'cid', 'label'],
                 primary_key='ssv')
    so, exist_sid = cve_exists(db='exp', table='seebug', key=['ssv'])
    # A set makes the per-record membership test O(1) instead of O(n).
    known_sids = set(exist_sid)
    max_page, headers = seebug_max_page()

    # Compare local data against the remote pages, starting with the first.
    seebug_ssv_add = dict()
    seebug_cve_exp_add = dict()
    # NOTE(review): range() excludes max_page itself; if seebug_max_page()
    # returns the last page number, that page is never crawled - confirm.
    for page in range(1, max_page):
        print("[+] Crawling seebug vul page:%d" % page)
        vul_url = 'https://www.seebug.org/vuldb/vulnerabilities?page={}'.format(
            str(page))
        ssv_page, exp_page = seebug_page_parser(url=vul_url, headers=headers)

        # Count how many records on this page are already stored locally and
        # drop them so that only new records remain.
        et = 0
        for row in list(ssv_page.values()):
            sid = row[0]
            if sid in known_sids:
                et += 1
                ssv_page.pop(sid, None)
                # exp_page is keyed by SSV id; the previous check looked the id
                # up among the value tuples, which never matched, so stale
                # records leaked into the returned dict.
                exp_page.pop(sid, None)

        # Decide whether this page contributes rows for the database.
        # 20 is presumably the listing page size - TODO confirm.
        if et < 20:
            seebug_ssv_add.update(ssv_page)
        seebug_cve_exp_add.update(exp_page)

        # Stop once a page overlaps with local data: later pages are assumed
        # to be stored already.
        if et > 0:
            break

    if seebug_ssv_add:
        sql = 'replace INTO seebug (ssv,date,title,cid,label) VALUES (?, ?, ?, ?, ?)'
        so.executemany(sql, seebug_ssv_add.values())
    else:
        print("[!] Seebug updated not found")
    print('[+] Searched exp added from remote seebug.org')
    print('[+] Add ssv:%s' % seebug_cve_exp_add)
    return seebug_cve_exp_add
def expdb_exp_add(reparse=False):
    """
    Extract CVE exp labels from exploit-db.com to complement the other sources.

    Update strategy: incremental overwrite.

    :param reparse: when true, the candidate ids come from ``expdb_exists()``;
        otherwise they are the keys of the mapping returned by ``rss_xml()``.
        In both cases ids already present in the local ``exploitdb`` table are
        skipped.
    :return cves: tuple_dict, exploit-db.com exp data with cve number, e.g.
        {'48587': ('48587', '2020-12712', 'Sander Ubink', 'remote', 'Multiple', '2020-06-15'),
         '48590': ('48590', '2020-5515', 'BillyV4', 'webapps', 'PHP', '2020-06-16')}
    """
    create_table(
        db='exp',
        table='exploitdb',
        key=['edb_id', 'cve_id', 'author', 'type', 'platform', 'date'],
        primary_key='edb_id')
    so, exist_eid = cve_exists(db='exp', table='exploitdb', key=['edb_id'])
    # Release the handle on every path; previously it was closed only when
    # new rows were written, so it leaked on "nothing new" and on errors.
    try:
        if reparse:
            candidate_eids = [str(i) for i in expdb_exists()]
        else:
            candidate_eids = list(rss_xml().keys())
        add_eid = list(set(candidate_eids).difference(set(exist_eid)))

        print("[+] Add eid:%s" % add_eid)

        exps = dict()
        cves = dict()
        for eid in add_eid:
            # Best effort: a failure on one entry is reported and must not
            # abort the remaining entries.
            try:
                exp_value, cve_value = expdb_parser(eid)
                if exp_value:
                    exps[eid] = tuple(exp_value.values())
                if cve_value:
                    cves[eid] = tuple(cve_value.values())
            except Exception as e:
                print("[!] DOWNLOAD ERROR %s error:%s" % (eid, repr(e)))

        if exps:
            sql = 'replace INTO exploitdb (edb_id, cve_id, author, type, platform, date) VALUES (?, ?, ?, ?, ?, ?)'
            so.executemany(sql, exps.values())
        else:
            print("[!] exploit-db updated not found")
    finally:
        so.close()
    print('[+] Searched exp added from remote exploit-db')
    return cves
Beispiel #5
0
def github_exp_add(per_page=100):
    """
    Search github for CVE ids and exp labels using the configured keyword(s).

    Update strategy: supports partial refresh of existing data as well as
    incremental updates. The number of results to fetch is the difference
    between the remote total and the ``total_count`` stored in
    ``conf/info.conf``, capped at 1000.

    :param per_page: page size forwarded to ``max_total_count`` and
        ``single_request``
    :return cve_add: tuple_dict, cve id added in github.com
    """
    # Compare local and remote data and work on the difference.
    # sqlite3 is used instead of JSON storage because it is easier to extend
    # and to query.
    create_table(db='exp',
                 table='github',
                 key=['cve_id', 'exp_publishedtime', 'label'],
                 primary_key='cve_id')
    so, known_cids = cve_exists(db='exp', table='github', key=['cve_id'])

    conf = configparser.ConfigParser()
    conf.read('conf/info.conf')
    section = 'CVE_Label'
    local_count = conf.get(section, 'total_count')
    key_name = conf.get(section, 'search_key')
    api_token = conf.get(section, 'api_token')

    gh = GitHub(api_token=api_token)
    total_count = max_total_count(gh=gh, kname=key_name, per_page=per_page)

    cve_add = dict()
    if not total_count:
        print("[Error] Failed to get github total count")
    else:
        print("[+] Checked, Got github res total count:%s" % total_count)
        # todo: if add_count<per_page
        add_count = min(total_count - int(local_count), 1000)

        print("[+] local res:%s,total res:%s,add res:%s" %
              (local_count, total_count, add_count))

        # Fetch enough pages to cover the difference.
        page_count = math.ceil(add_count / per_page)
        if page_count <= 0:
            print("[!] Github updated not found ")
        else:
            for page in range(1, page_count + 1):
                ok, found = single_request(gh=gh,
                                           kname=key_name,
                                           per_page=per_page,
                                           page=page)
                if not (ok and found):
                    print("[!] Error in call_to_the_api")
                else:
                    # Iterate over a snapshot because entries are deleted.
                    for row in list(found.values()):
                        if row[0] in known_cids:
                            del found[row[0]]
                if found:
                    cve_add = {**cve_add, **found}

            # Insert into sqlite3 and persist the new remote total.
            so.executemany(
                'replace INTO github (cve_id,exp_publishedtime,label) VALUES (?, ?, ?)',
                cve_add.values())
            conf.set(section, 'total_count', str(total_count))
            with open('conf/info.conf', 'w') as configfile:
                conf.write(configfile)
    print('[+] Searched exp added from remote github.com')
    return cve_add
Beispiel #6
0
def _rank_exp_proba(cve, nlp, model, show_summary=False):
    """
    Score NVD rows with the fitted model and rank them by predicted value.

    :param cve: rows whose columns match the module-level ``cve_tags``
    :param nlp: fitted text transformer exposing ``transform``
    :param model: fitted model exposing ``predict`` (and ``summary``)
    :param show_summary: call ``model.summary()`` before predicting
    :return: DataFrame with columns CVE_ID, Description, PubDate, EXP_Proba,
        sorted by EXP_Proba in descending order
    """
    cve_df = pd.DataFrame(cve, columns=cve_tags)
    x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
        'str')
    fx = nlp.transform(x)
    if show_summary:
        model.summary()
    pre = pd.DataFrame(model.predict(fx))
    ranked = pd.concat([
        cve_df[[
            'CVE_Items_cve_CVE_data_meta_ID',
            'CVE_Items_cve_description_description_data_value',
            'CVE_Items_publishedDate'
        ]], pre
    ],
                       axis=1)
    ranked.columns = ['CVE_ID', 'Description', 'PubDate', 'EXP_Proba']
    return ranked.sort_values(by='EXP_Proba', ascending=False)


def exp_model(delta=0):
    """
    Train the exp model and predict on recently published CVEs.

    The model is trained on every row of the ``nvd`` table and then applied
    to two selections: CVEs whose published date matches the day given by
    ``delta``, and CVEs whose published date matches the current month.

    :param delta: day offset forwarded to ``time_delta`` to pick the day
    :return: tuple ``(exp_proba, exp_proba2)``. ``exp_proba`` is a list of
        ``[CVE_ID, Description, PubDate, EXP_Proba]`` rows for the selected
        day; ``exp_proba2`` holds the same for the current month with an extra
        ``Ground_Truth`` field (1 if the CVE id is in the ``exps`` table,
        otherwise the string 'None'). Each is ``[]`` when no CVE matched.
    """
    # Two distinct lists: the results must not alias each other when both
    # selections are empty.
    exp_proba = []
    exp_proba2 = []

    # Training
    _, cve = cve_query(db='cve', table='nvd', key=['*'])
    cve_df = pd.DataFrame(cve, columns=cve_tags)
    x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
        'str')
    y = cve_df['CVE_EXP_label'].astype('int')
    nlp = wordindex(char_level=False)
    fx, fy = nlp.fit_transform(x, y)
    train_x, valid_x, train_y, valid_y = train_test_split(fx,
                                                          fy,
                                                          random_state=2019,
                                                          test_size=0.3)
    model = textcnn(input_type='wordindex',
                    max_len=nlp.max_length,
                    input_dim=nlp.input_dim,
                    output_dim=16,
                    class_num=1)
    model.fit(train_x,
              train_y,
              validation_data=(valid_x, valid_y),
              epochs=1,
              batch_size=128)

    # Test: predict the CVEs added on the selected day
    modified_time = time_delta(delta=delta, format="%Y-%m-%d")
    _, cve = cve_query_where(
        db='cve',
        table='nvd',
        key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(modified_time))
    if cve:
        exp_proba = _rank_exp_proba(cve, nlp, model,
                                    show_summary=True).values.tolist()
    else:
        print('[INFO] No CVE Today')

    # Test: predict the CVEs added this month
    # (named so that it does not shadow the stdlib ``time`` module)
    month_prefix = time_delta(format="%Y-%m")
    _, cve = cve_query_where(
        db='cve',
        table='nvd',
        key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(month_prefix))
    if cve:
        month_frame = _rank_exp_proba(cve, nlp, model)

        _, exist_cid = cve_exists(db='exp', table='exps', key=['cve_id'])
        # Set lookup keeps the per-row membership test O(1).
        known_cids = set(exist_cid)
        month_frame['Ground_Truth'] = month_frame.apply(
            lambda row: 1 if row['CVE_ID'] in known_cids else 'None', axis=1)
        exp_proba2 = month_frame.values.tolist()
    else:
        print('[INFO] No CVE Month')

    return exp_proba, exp_proba2
Beispiel #7
0
def _score_exp_frame(cve_df, nlp, model):
    """
    Predict with the loaded model on a CVE frame and rank by predicted value.

    :param cve_df: DataFrame containing at least the CVE id, description and
        published-date columns
    :param nlp: fitted text transformer exposing ``transform``
    :param model: fitted model exposing ``predict``
    :return: list of ``[CVE_ID, Description, PubDate, EXP_Proba]`` rows sorted
        by EXP_Proba in descending order
    """
    x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
        'str')
    fx = nlp.transform(x)
    pre = pd.DataFrame(model.predict(fx))
    scored = pd.concat([
        cve_df[[
            'CVE_Items_cve_CVE_data_meta_ID',
            'CVE_Items_cve_description_description_data_value',
            'CVE_Items_publishedDate'
        ]], pre
    ],
                       axis=1)
    scored.columns = ['CVE_ID', 'Description', 'PubDate', 'EXP_Proba']
    return scored.sort_values(by='EXP_Proba', ascending=False)


def exp_model(epoch, delta=0):
    """
    Train (when enabled) and apply the exp prediction model.

    Dumps the labelled NVD data to ``CVE_EXP_2020.csv``, loads the persisted
    transformer and model from ``data/model/`` and predicts on three
    selections: the CVEs of the current run (``epoch``), the CVEs published on
    the day(s) given by ``delta``, and the CVEs published this month.

    :param epoch: mapping whose values are 5-field rows (CVE id, description,
        published date, last-modified date, source) for the CVEs added in the
        current run; a falsy value skips this prediction
    :param delta: a single day offset or an iterable of day offsets, each
        forwarded to ``time_delta``
    :return: tuple ``(epoch_exp_proba, day_exp_proba, month_exp_proba)``; each
        element is a list of ``[CVE_ID, Description, PubDate, EXP_Proba]``
        rows (the monthly rows carry an extra ``Ground_Truth`` field) or
        ``None`` when the selection is empty
    """
    _, cve = cve_query(db='cve', table='nvd', key=['*'])
    cve_df = pd.DataFrame(cve, columns=cve_tags)
    cve_csv = cve_df[[
        'CVE_Items_cve_CVE_data_meta_ID',
        'CVE_Items_cve_description_description_data_value',
        'CVE_Items_publishedDate', 'CVE_EXP_label'
    ]]
    cve_csv.to_csv('CVE_EXP_2020.csv', index=0)

    # Counter model decay: retrain once a month.
    # The day-of-month is compared with 404, which never matches, so
    # retraining is effectively switched off (original note: the ECS host
    # does not have enough memory).
    month_day = time_delta(delta=0, format='%Y-%m-%d')
    if int(month_day.split('-')[2]) == 404:
        print('[+] Retrain model')
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        y = cve_df['CVE_EXP_label'].astype('int')
        nlp = wordindex(char_level=False)
        fx, fy = nlp.fit_transform(x, y)
        joblib.dump(nlp, 'data/model/nlp.h5')
        train_x, valid_x, train_y, valid_y = train_test_split(
            fx, fy, random_state=2019, test_size=0.3)
        model = textcnn(input_type='wordindex',
                        max_len=nlp.max_length,
                        input_dim=nlp.input_dim,
                        output_dim=16,
                        class_num=1)
        model.fit(train_x,
                  train_y,
                  validation_data=(valid_x, valid_y),
                  epochs=1,
                  batch_size=128)
        joblib.dump(model, 'data/model/textcnn.h5')

    print('[+] Load predict model')
    nlp = joblib.load('data/model/nlp.h5')
    model = joblib.load('data/model/textcnn.h5')

    # Test: predict the CVEs added in the current run
    if epoch:
        cve_epoch = pd.DataFrame(
            list(epoch.values()),
            columns=[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate', 'CVE_Items_lastModifiedDate',
                'Source'
            ])
        print(cve_epoch)
        epoch_exp_proba = _score_exp_frame(cve_epoch, nlp,
                                           model).values.tolist()
    else:
        epoch_exp_proba = None
        print('[INFO] No CVE Today Epoch')

    # Test: predict the CVEs added on the requested day(s)
    # The default ``delta=0`` is a scalar; wrap non-iterables so that the loop
    # below works for both a single offset and a collection of offsets.
    try:
        deltas = list(delta)
    except TypeError:
        deltas = [delta]
    cve = []
    for d in deltas:
        modified_time = time_delta(delta=d, format="%Y-%m-%d")
        _, tmp = cve_query_where(
            db='cve',
            table='nvd',
            key=['*'],
            where='CVE_Items_publishedDate like "%{}%"'.format(modified_time))
        cve.extend(tmp)
    if cve:
        day_frame = pd.DataFrame(cve, columns=cve_tags)
        day_exp_proba = _score_exp_frame(day_frame, nlp,
                                         model).values.tolist()
    else:
        day_exp_proba = None
        print('[INFO] No CVE Today')

    # Test: predict the CVEs added this month
    # (named so that it does not shadow the stdlib ``time`` module)
    month_prefix = time_delta(format="%Y-%m")
    _, cve = cve_query_where(
        db='cve',
        table='nvd',
        key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(month_prefix))
    if cve:
        month_frame = _score_exp_frame(pd.DataFrame(cve, columns=cve_tags),
                                       nlp, model)

        _, exist_cid = cve_exists(db='exp', table='exps', key=['cve_id'])
        # Set lookup keeps the per-row membership test O(1).
        known_cids = set(exist_cid)
        month_frame['Ground_Truth'] = month_frame.apply(
            lambda row: 1 if row['CVE_ID'] in known_cids else 'None', axis=1)
        month_exp_proba = month_frame.values.tolist()
    else:
        month_exp_proba = None
        print('[INFO] No CVE Month')
    with open(path('../data/log', 'cveflow.log'), 'a+') as f:
        f.write('[Done] CVE EXP Prediction')
    return epoch_exp_proba, day_exp_proba, month_exp_proba