Example #1
def generate_lookup_table():
    db_driver = idb.ISSuedb()
    output = db_driver.db_retrieve(
        "select name from sqlite_master where type='table' order by name;")
    table_dict = {i[0].replace("$", "_"): i[0] for i in output}

    file_list = os.listdir(SRC_DIR)
    file_list = [os.path.join(SRC_DIR, f) for f in file_list]

    file_list_test = os.listdir(TEST_DIR)
    file_list_test = [os.path.join(TEST_DIR, f) for f in file_list_test]

    files = file_list + file_list_test
    files_dict = {i: False for i in files}

    reload = util.Reload(TSV_FILE)

    for item in table_dict:
        flag = False
        for f in files:
            if item in f:
                flag = True
                print("{}\t{}".format(table_dict[item], f))
                files_dict[f] = True
                break
        if not flag:
            print("{}\tNULL".format(table_dict[item]))

    for f in files_dict:
        if not files_dict[f]:
            print("NULL\t{}".format(f))

    db_driver.db_close()
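
The lookup above pairs each SQLite table with the file whose name contains the table name after replacing "$" with "_", printing NULL for anything unmatched on either side. A minimal standalone sketch of that pairing step, with invented table and file names:

table_names = ["com$example$app", "org$demo$tool"]          # invented
files = ["data/com_example_app.csv", "data/unrelated.csv"]  # invented

table_dict = {t.replace("$", "_"): t for t in table_names}
for key, original in table_dict.items():
    match = next((f for f in files if key in f), None)
    print("{}\t{}".format(original, match if match else "NULL"))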
Example #2
def query_issue(scan_output, max_depth=4):
    """
    Search all possible issues for scan_output, a list of apps already sorted
    by similarity in descending order. Takes roughly one minute to produce results.
    :param scan_output: format follows the output of the descript() function
    :param max_depth: limits the search depth to the top few most similar apps
    :return: all query results
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    sql = """select issue_num, comments, state, title, body, commit_id, labels from {}
                    order by length(body) desc"""
    overall_table = {}
    # iterate over all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]

        tab_name = table2tsv.file2table(app)
        one_dict['data'] = []
        one_dict['keys'] = []

        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))
        head = ["issue_num", "comments", "state", "title", "body", "commit_id", "labels"]
        f_output = issuedb.retrieve_formatter(head, output)

        title_list = util.get_col(output, head.index('title'))
        body_list = util.get_col(output, head.index('body'))
        label_list = util.get_col(output, head.index('labels'))
        reply_list = util.get_col(output, head.index('issue_num'))
        pre_calc_val = _pre_calc(title_list=title_list,
                                 body_list=body_list,
                                 label_list=label_list,
                                 reply_list=reply_list,
                                 keys_sea=keys_sea)

        for k in keys_sea:
            # flatten one group of token lists into a single query string
            keys = " ".join(" ".join(part) for part in k)
            ess_keys = nlp_util.stem_sentence(keys)

            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict
        logger.debug(pp.pformat(overall_table))
        logger.debug("#" * 50)
    return overall_table
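
query_issue returns a dict keyed by table name; each value holds the app similarity ('sim'), the top-ranked rows ('data', at most 3 per key group), and the stemmed key string used for each row ('keys'). A hedged sketch of consuming that structure (overall_table here is a made-up instance with the shape inferred from the code above):

overall_table = {
    "app_table": {  # invented table name
        "sim": 0.83,
        "keys": ["login crash", "login crash"],
        "data": ["row-1", "row-2"],  # stand-ins for formatted issue rows
    },
}
for tab_name, entry in overall_table.items():
    print(tab_name, entry["sim"])
    for key, row in zip(entry["keys"], entry["data"]):
        print("\t", key, "->", row)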
Example #3
def rank_review(app_score_list: list, max_depth=4) -> list:
    hot_keywords, two_keywords = get_keywords()
    rdb = issuedb.ISSuedb()  
    all_review = []
    # number = [5000, 10000, 15000, 20000]
    # number = [1000, 2000, 3000, 4000]
    for m in range(min(len(app_score_list), max_depth)):
        score_list = app_score_list[m][2]
        app_weight = app_score_list[m][1]

        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        ess_keys = set()
        for r in keys_sea:
            for a_list in r:
                ess_keys = ess_keys.union(a_list)
        ess_keys = " ".join(list(ess_keys))
        ess_keys = nlp_util.stem_sentence(ess_keys)
        ess_keys = set(ess_keys)
        app = app_score_list[m][0]
        app_name = os.path.basename(app)[:-4]  # strip the ".csv" extension
        score = {
            'star_num': 0,
            'hot_key_words': 0,
            'helpful_num': 0,
            'ui_key': 0,
            'similar_app': 0,  # app similarity score
            'two_gram_keywords': 0,
        }
        sql = """select review_id,content,star_num,helpful_num from {} order by length(content) desc"""
        tab_name = table2tsv.file2table(app)  # csv file name -> database table name
        output = rdb.db_retrieve(sql.format(tab_name))  # SQL query result
        # head = ["review_id", "content", "bold", "star_num", "helpful_num", "reply_content"]
        head = ["review_id", "content", "star_num", "helpful_num"]
        f_output = issuedb.retrieve_formatter(head, output)
        # f_output[0].review_id
        for i in f_output:
            if len(i.content) < 100:
                break  # rows come sorted by content length desc, so stop early
            processed_content = nlp_process(i.content)  # digits are not removed
            score_sum = 0
            score['star_num'] = star_score[i.star_num]
            score['hot_key_words'] = keywords_in_content(hot_keywords, processed_content, False) * app_weight  # keyword score
            score['ui_key'] = ui_key_word(ess_keys, processed_content) * app_weight
            # score['two_gram_keywords'] = two_gram_key_word(two_keywords, processed_content)
            score['helpful_num'] = int(i.helpful_num) * 0.25  # int() avoids TypeError: can't multiply sequence by non-int of type 'float'
            if score['helpful_num'] > 25:
                score['helpful_num'] = 25
            for k in score:
                score_sum += score[k]
            if score_sum > 3:  # score threshold (3, 2, 1)
                all_review.append([app_name, score_sum, i])
            # if len(all_review) > number[m]:
            #     break
    # sort all_review by score in descending order
    result = sorted(all_review, key=itemgetter(1), reverse=True)
    return result[:400]
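
The helpful_num term above is scaled by 0.25 and capped at 25, so a review saturates that component at 100 helpful votes. A self-contained check of the arithmetic (helpful_score is a hypothetical helper mirroring the inline code):

def helpful_score(helpful_num):
    # int() mirrors the cast above; the raw column value is a string
    return min(int(helpful_num) * 0.25, 25)

assert helpful_score("40") == 10.0
assert helpful_score("100") == 25
assert helpful_score("500") == 25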
Example #4
File: api.py Project: WuYff/Bugine
def query_issue(scan_output, max_depth=4):
    """
    Search all possible issues for scan_output, a list of apps already sorted
    by similarity in descending order. Takes roughly one minute to produce results.
    :param scan_output: format follows the output of the descript() function
    :param max_depth: limits the search depth to the top few most similar apps
    :return: all query results
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    # sql = """select review_id,content,bold,star_num,helpful_num,reply_content from {}
    #                 order by length(content) desc"""
    sql = """select review_id,content,star_num from {}
                       order by length(content) desc"""
    overall_table = {}
    # iterate over all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]  # similarity_score
        tab_name = table2tsv.file2table(app)  # csv file name -> database table name
        logger.debug(tab_name)
        one_dict['data'] = []
        one_dict['keys'] = []

        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))  # output is list of tuple

        # head = ["review_id", "content", "bold", "star_num", "helpful_num", "reply_content"]
        head = ["review_id", "content", "star_num"]
        f_output = issuedb.retrieve_formatter(head, output)

        # title_list = util.get_col(output, head.index('title'))
        body_list = util.get_col(output, head.index('content'))

        star_list = util.get_col(output, head.index('star_num'))
        # reply_list = util.get_col(output, head.index('reply_content'))
        # bold_list = util.get_col(output, head.index('bold'))
        # label_list = util.get_col(output, head.index('labels'))
        # reply_list = util.get_col(output, head.index('issue_num'))
        pre_calc_val = _pre_calc(body_list=body_list, keys_sea=keys_sea)  # precompute ranking features

        for k in keys_sea:
            # flatten one group of token lists into a single query string
            keys = " ".join(" ".join(part) for part in k)
            ess_keys = nlp_util.stem_sentence(keys)
            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict
        logger.debug(pp.pformat(overall_table))
        logger.debug("#" * 50)
    return overall_table
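
The flattening step inside the loop turns one keys_sea entry, a group of token lists, into a single query string before stemming. A standalone sketch with invented tokens (stem is a trivial stand-in for nlp_util.stem_sentence, which is not shown in this snippet):

def stem(sentence):
    # stand-in for nlp_util.stem_sentence; the real one stems each token
    return sentence.lower().split()

k = [["Login", "Button"], ["Crash", "on", "Rotate"]]  # one keys_sea entry, invented
keys = " ".join(" ".join(part) for part in k)         # "Login Button Crash on Rotate"
ess_keys = stem(keys)                                 # query tokens fed to search_rank
print(ess_keys)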
Example #5
                "coef"] * the_score_detail[i]["val"]

        the_score_detail["total"] = sum(
            [the_score_detail[i]["z_term"] for i in the_score_detail])
        score.append(the_score_detail)

    tmp = zip(corpus, score)
    return list(tmp)


if __name__ == '__main__':
    from model import nlp_util
    from model import issuedb

    reload = util.Reload()
    rdb = issuedb.ISSuedb()
    sql = """select issue_num, comments, state, title, body, commit_id from {} 
    where labels like '%bug%' or commit_id is not null order by length(body) desc"""

    # all_cor = []
    # std_tbs = url_repo.get_std_name_list(github=True)
    # for tb in std_tbs:
    #     output = rdb.db_retrieve(sql.format(tb))
    #     for i in range(len(output)):
    #         tmp = list(output[i])
    #         tmp.insert(0, tb)
    #         tmp = tuple(tmp)
    #         all_cor.append(tmp)
    #
    # # pp.pprint(all_cor)
    #
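
The loop at the top of this example weights each score component and sums the results: z_term = coef * val per component, then a grand total. A self-contained sketch of that aggregation with invented component names and coefficients:

the_score_detail = {
    "title": {"coef": 2.0, "val": 0.5},  # invented components and weights
    "body": {"coef": 1.0, "val": 0.3},
}
for i in the_score_detail:
    the_score_detail[i]["z_term"] = the_score_detail[i]["coef"] * the_score_detail[i]["val"]
the_score_detail["total"] = sum(the_score_detail[i]["z_term"] for i in the_score_detail)
print(the_score_detail["total"])  # 1.3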