def generate_lookup_table():
    """Print a tab-separated lookup table mapping DB table names to source files."""
    db_driver = idb.ISSuedb()
    output = db_driver.db_retrieve(
        "select name from sqlite_master where type='table' order by name;")
    # SQLite table names use "$" where file names use "_"; normalize for matching.
    table_dict = {i[0].replace("$", "_"): i[0] for i in output}

    file_list = os.listdir(SRC_DIR)
    file_list = [os.path.join(SRC_DIR, f) for f in file_list]
    file_list_test = os.listdir(TEST_DIR)
    file_list_test = [os.path.join(TEST_DIR, f) for f in file_list_test]
    files = file_list + file_list_test
    files_dict = {i: False for i in files}

    reload = util.Reload(TSV_FILE)
    for item in table_dict:
        flag = False
        for f in files:
            if item in f:
                flag = True
                print("{}\t{}".format(table_dict[item], f))
                files_dict[f] = True
                break
        if not flag:
            # Table with no matching file.
            print("{}\tNULL".format(table_dict[item]))
    for f in files_dict:
        if not files_dict[f]:
            # File with no matching table.
            print("NULL\t{}".format(f))
    db_driver.db_close()
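
# Illustrative sketch (not part of the pipeline): the matching rule above
# normalizes SQLite table names, which use "$", to the "_" form used in file
# names, then substring-matches against the candidate paths. The sample
# names below are made up.
def _demo_table_file_matching():
    tables = ["com$example$app", "org$demo$tool"]
    files = ["data/com_example_app.tsv", "data/unrelated.tsv"]
    for t in tables:
        norm = t.replace("$", "_")
        match = next((f for f in files if norm in f), None)
        print("{}\t{}".format(t, match or "NULL"))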
def query_issue(scan_output, max_depth=4):
    """
    Search all candidate issues given scan_output, a list of apps already
    sorted by similarity in descending order. Takes roughly one minute.
    :param scan_output: see the output of descript() for the format
    :param max_depth: search-depth limit; only the top-N most similar apps are used
    :return: all query results
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    sql = """select issue_num, comments, state, title, body, commit_id, labels
             from {} order by length(body) desc"""
    overall_table = {}  # all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]
        tab_name = table2tsv.file2table(app)
        one_dict['data'] = []
        one_dict['keys'] = []
        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))
        head = ["issue_num", "comments", "state", "title", "body",
                "commit_id", "labels"]
        f_output = issuedb.retrieve_formatter(head, output)
        title_list = util.get_col(output, head.index('title'))
        body_list = util.get_col(output, head.index('body'))
        label_list = util.get_col(output, head.index('labels'))
        reply_list = util.get_col(output, head.index('issue_num'))
        pre_calc_val = _pre_calc(title_list=title_list, body_list=body_list,
                                 label_list=label_list, reply_list=reply_list,
                                 keys_sea=keys_sea)
        for k in keys_sea:
            # Flatten one key group into a single stemmed query string
            # (renamed loop variable; it used to shadow the outer `i`).
            keys = " ".join(" ".join(tokens) for tokens in k)
            ess_keys = nlp_util.stem_sentence(keys)
            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict
    logger.debug(pp.pformat(overall_table))
    logger.debug("#" * 50)
    return overall_table
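
# Illustrative sketch of the key-flattening step inside query_issue. The
# exact shape of keys_sea comes from _filter_search_keys and is assumed
# here: a list of key groups, each group an iterable of token sequences.
def _demo_flatten_keys():
    keys_sea = [[("crash", "on"), ("startup",)], [("dark", "mode")]]
    for group in keys_sea:
        flat = " ".join(" ".join(tokens) for tokens in group)
        print(flat)  # "crash on startup", then "dark mode"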
def rank_review(app_score_list: list, max_depth=4) -> list:
    hot_keywords, two_keywords = get_keywords()
    rdb = issuedb.ISSuedb()
    all_review = []
    # number = [5000, 10000, 15000, 20000]
    # number = [1000, 2000, 3000, 4000]
    for m in range(min(len(app_score_list), max_depth)):
        score_list = app_score_list[m][2]
        app_weight = app_score_list[m][1]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        ess_keys = set()
        for r in keys_sea:
            for a_list in r:
                ess_keys = ess_keys.union(a_list)
        ess_keys = " ".join(list(ess_keys))
        ess_keys = set(nlp_util.stem_sentence(ess_keys))

        app = app_score_list[m][0]
        app_name = os.path.basename(app)[:-4]
        score = {
            'star_num': 0,
            'hot_key_words': 0,
            'helpful_num': 0,
            'ui_key_words': 0,  # key renamed; it used to be 'ui_key' and never got updated
            'similar_app': 0,  # app similarity
            'two_gram_keywords': 0,
        }
        sql = """select review_id,content,star_num,helpful_num from {}
                 order by length(content) desc"""
        tab_name = table2tsv.file2table(app)  # csv file name -> database table name
        output = rdb.db_retrieve(sql.format(tab_name))  # raw SQL query result
        # head = ["review_id", "content", "bold", "star_num", "helpful_num", "reply_content"]
        head = ["review_id", "content", "star_num", "helpful_num"]
        f_output = issuedb.retrieve_formatter(head, output)  # f_output[0].review_id
        for i in f_output:
            # Rows are ordered by content length descending, so we can stop early.
            if len(i.content) < 100:
                break
            processed_content = nlp_process(i.content)  # digits are not removed
            score_sum = 0
            score['star_num'] = star_score[i.star_num]
            # Keyword scores, weighted by app similarity.
            score['hot_key_words'] = keywords_in_content(
                hot_keywords, processed_content, False) * app_weight
            score['ui_key_words'] = ui_key_word(ess_keys, processed_content) * app_weight
            # score['two_gram_keywords'] = two_gram_key_word(two_keywords, processed_content)
            # int() avoids: TypeError: can't multiply sequence by non-int of type 'float'
            score['helpful_num'] = min(int(i.helpful_num) * 0.25, 25)
            for k in score:
                score_sum += score[k]
            if score_sum > 3:  # threshold: 3 2 1
                all_review.append([app_name, score_sum, i])
        # if len(all_review) > number[m]:
        #     break
    # Finally, sort all collected reviews by their combined score.
    result = sorted(all_review, key=itemgetter(1), reverse=True)
    return result[:400]
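
# Hypothetical invocation of rank_review, assuming each app_score_list row
# is [app_path, similarity, score_list] as consumed above. The layout of
# the result rows ([app_name, score_sum, review_row]) is fixed by the code.
def _demo_rank_review(app_score_list):
    top_reviews = rank_review(app_score_list, max_depth=4)
    for app_name, total, review in top_reviews[:5]:
        print(app_name, round(total, 2), review.review_id)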
def query_issue(scan_output, max_depth=4):
    """
    Search all candidate reviews given scan_output, a list of apps already
    sorted by similarity in descending order. Takes roughly one minute.
    :param scan_output: see the output of descript() for the format
    :param max_depth: search-depth limit; only the top-N most similar apps are used
    :return: all query results
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    # sql = """select review_id,content,bold,star_num,helpful_num,reply_content from {}
    #          order by length(content) desc"""
    sql = """select review_id,content,star_num from {}
             order by length(content) desc"""
    overall_table = {}  # all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]  # similarity_score
        tab_name = table2tsv.file2table(app)
        logger.debug(tab_name)
        one_dict['data'] = []
        one_dict['keys'] = []
        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))  # output is a list of tuples
        # head = ["review_id", "content", "bold", "star_num", "helpful_num", "reply_content"]
        head = ["review_id", "content", "star_num"]
        f_output = issuedb.retrieve_formatter(head, output)
        body_list = util.get_col(output, head.index('content'))
        star_list = util.get_col(output, head.index('star_num'))
        pre_calc_val = _pre_calc(body_list=body_list, keys_sea=keys_sea)
        for k in keys_sea:
            # Flatten one key group into a single stemmed query string
            # (renamed loop variable; it used to shadow the outer `i`).
            keys = " ".join(" ".join(tokens) for tokens in k)
            ess_keys = nlp_util.stem_sentence(keys)
            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict
    logger.debug(pp.pformat(overall_table))
    logger.debug("#" * 50)
    return overall_table
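
# Shape of the overall_table returned above, reconstructed from the code;
# the concrete values are whatever sort_candidate_seq yields per key group:
# {
#     "<table_name>": {
#         "sim":  <similarity of the matched app>,
#         "keys": [<stemmed key string>, ...],     # repeated once per kept row
#         "data": [<ranked candidate rows>, ...],  # top 3 per key group
#     },
#     ...
# }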
"coef"] * the_score_detail[i]["val"] the_score_detail["total"] = sum( [the_score_detail[i]["z_term"] for i in the_score_detail]) score.append(the_score_detail) tmp = zip(corpus, score) return list(tmp) if __name__ == '__main__': from model import nlp_util from model import issuedb reload = util.Reload() rdb = issuedb.ISSuedb() sql = """select issue_num, comments, state, title, body, commit_id from {} where labels like '%bug%' or commit_id is not null order by length(body) desc""" # all_cor = [] # std_tbs = url_repo.get_std_name_list(github=True) # for tb in std_tbs: # output = rdb.db_retrieve(sql.format(tb)) # for i in range(len(output)): # tmp = list(output[i]) # tmp.insert(0, tb) # tmp = tuple(tmp) # all_cor.append(tmp) # # # pp.pprint(all_cor) #