def query_issue(scan_output, max_depth=4):
    """
    Search all candidate issues based on scan_output, a list of apps already
    sorted by similarity in descending order. Takes roughly one minute.

    :param scan_output: for the format, see the output of descript()
    :param max_depth: search-depth limit; only the top few most similar apps are used
    :return: all query results
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    sql = """select issue_num, comments, state, title, body, commit_id, labels
             from {} order by length(body) desc"""

    overall_table = {}  # all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]
        tab_name = table2tsv.file2table(app)
        one_dict['data'] = []
        one_dict['keys'] = []
        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))
        head = ["issue_num", "comments", "state", "title", "body", "commit_id", "labels"]
        f_output = issuedb.retrieve_formatter(head, output)
        title_list = util.get_col(output, head.index('title'))
        body_list = util.get_col(output, head.index('body'))
        label_list = util.get_col(output, head.index('labels'))
        reply_list = util.get_col(output, head.index('issue_num'))
        pre_calc_val = _pre_calc(title_list=title_list, body_list=body_list,
                                 label_list=label_list, reply_list=reply_list,
                                 keys_sea=keys_sea)
        for k in keys_sea:
            # flatten the key group into a single space-joined query string
            keys = " ".join(" ".join(part) for part in k)
            ess_keys = nlp_util.stem_sentence(keys)
            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))  # keep at most the top three candidates per key
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict

    logger.debug(pp.pformat(overall_table))
    logger.debug("#" * 50)
    return overall_table
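
# Shape of the structure returned by query_issue, reconstructed from the code
# above for reference (keys follow the code verbatim):
#
#   {table_name: {'sim':  <app similarity, float>,
#                 'keys': [<stemmed query string>, ...],  # repeated per kept row
#                 'data': [<ranked issue row>, ...]}}     # top 3 rows per key group
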
def descript(query_decp, source_category, except_files=None, extend=False, pool_size=32):
    """
    Process the query description file and compute its similarity against every
    app in the database. Takes roughly one minute.

    :param query_decp: description-file matrix; example line:
                       xml_file_name, class_name, element_name
    :param source_category: app category, passed through to _scan_match
    :param except_files: keyword(s) for files to exclude; accepts a string or a
                         list of strings
    :param extend: if True, use the extended description corpus
                   (description_extend_all)
    :param pool_size: size of the parallel worker pool
    :return: overall similarity between the source app and every app in the
             database, sorted by similarity in descending order; used for app search
    """
    query_decp = nlp_util.process_xsv(query_decp)
    if extend:
        src_dir = work_path.in_project('./model/data/description_extend_all')
    else:
        src_dir = work_path.in_project('./model/data/description')
    logger = logging.getLogger("StreamLogger")
    logger.debug("description source dir: {}".format(src_dir))

    file_list = os.listdir(src_dir)
    file_list = [os.path.join(src_dir, f) for f in file_list]
    if except_files is not None:
        tmp = []
        rms = []
        if isinstance(except_files, str):
            for f in file_list:
                if except_files not in f:
                    tmp.append(f)
                else:
                    rms.append(f)
        elif isinstance(except_files, (list, set)):
            except_files = set(except_files)
            for f in file_list:
                if any(j in f for j in except_files):
                    rms.append(f)
                else:
                    tmp.append(f)
        logger.debug(pp.pformat(rms))
        file_list = tmp
    logger.debug(pp.pformat(file_list))

    scan_output = _scan_match(source_category, query_decp, file_list,
                              match_name.ngram_compare, [1, 0.5, 0.5],
                              threshold=0.7, pool_size=pool_size)
    # Overall similarity between the source app and every app in the database,
    # sorted by similarity in descending order. Each entry is a tuple:
    #   (str:   reference-app description file name,
    #    float: app similarity,
    #    list:  component similarities of the reference app:
    #           [(query-app component, reference-app component, similarity)])
    logger.debug(pp.pformat(util.get_col(scan_output, [0, 1])))
    return scan_output
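
# A minimal usage sketch, mirroring the __main__ block at the bottom of this
# file (the app id and category are placeholders):
#
#   decp = util.read_csv("model/data/description_extend_all/<app_id>.csv")
#   scan_output = descript(decp, source_category="Productivity",
#                          except_files="<app_id>", extend=True, pool_size=32)
#   scan_output[0]  # -> (best-matching description file, app similarity,
#                   #     [(query component, reference component, score), ...])
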
def test():
    # import matplotlib.pyplot as plt
    # import numpy as np
    # bins = np.linspace(0, 1, 100)
    # plt.hist(util.get_col(list_score, 2), bins, density=True, histtype='step',
    #          cumulative=-1, label='Empirical')
    # plt.show()
    path = "tsv/"
    filelist = os.listdir(path)
    filelist.sort(key=lambda x: x.lower())

    out_dict = dict()
    count = 0
    for file in filelist:
        count += 1
        full_path = os.path.join(path, file)
        print(full_path)
        tmp_out = util.read_tsv(full_path)
        out_dict[file] = nlp_util.process_tsv(tmp_out)
    print("file count", count)

    count = 0
    score_distribute_list = []
    for i in range(len(filelist)):
        i_file = filelist[i]
        for j in range(len(filelist)):
            if i == j:
                print("Ignore same file", i_file)
                continue
            count += 1
            j_file = filelist[j]
            name = "{}^{}".format(i_file, j_file)
            if len(out_dict[i_file]) == 0 or len(out_dict[j_file]) == 0:
                print("EMPTY", name)
                continue
            list_score = weight_compare_list(out_dict[i_file], out_dict[j_file],
                                             ngram_compare)
            score_col = util.get_col(list_score, 2)
            score_distribute_list.append((name, copy.deepcopy(score_col)))
            print("ADD", count, name)
    util.save_json(score_distribute_list, "score_distribute_list.json")
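
# The commented-out plotting snippet in test() hints at how the saved score
# distributions were inspected. Below is a minimal, hypothetical helper that
# reads score_distribute_list.json back with the stdlib json module and draws
# the reverse-cumulative histogram; the function name and the matplotlib/numpy
# dependency are assumptions, not part of the original pipeline.
def plot_score_distribution(json_path="score_distribute_list.json"):
    import json
    import matplotlib.pyplot as plt
    import numpy as np
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)  # [(name, [score, ...]), ...] as written by test()
    bins = np.linspace(0, 1, 100)
    for name, scores in data:
        # reverse-cumulative step histogram, as in the commented snippet in test()
        plt.hist(scores, bins, density=True, histtype='step', cumulative=-1, label=name)
    plt.xlabel("similarity score")
    plt.ylabel("reverse cumulative density")
    plt.show()
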
def query_issue(scan_output, max_depth=4):
    """
    Review-based variant: search all candidate reviews based on scan_output,
    a list of apps already sorted by similarity in descending order.
    Takes roughly one minute.

    NOTE: this definition shadows the issue-based query_issue defined above;
    only this review-based variant is visible to later code.

    :param scan_output: for the format, see the output of descript()
    :param max_depth: search-depth limit; only the top few most similar apps are used
    :return: all query results
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    # full review schema, kept for reference:
    # sql = """select review_id, content, bold, star_num, helpful_num, reply_content
    #          from {} order by length(content) desc"""
    sql = """select review_id, content, star_num from {}
             order by length(content) desc"""

    overall_table = {}  # all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]  # similarity score
        tab_name = table2tsv.file2table(app)
        logger.debug("table name: {}".format(tab_name))
        one_dict['data'] = []
        one_dict['keys'] = []
        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))  # list of tuples
        # head = ["review_id", "content", "bold", "star_num", "helpful_num", "reply_content"]
        head = ["review_id", "content", "star_num"]
        f_output = issuedb.retrieve_formatter(head, output)
        body_list = util.get_col(output, head.index('content'))
        star_list = util.get_col(output, head.index('star_num'))
        pre_calc_val = _pre_calc(body_list=body_list, keys_sea=keys_sea)
        for k in keys_sea:
            # flatten the key group into a single space-joined query string
            keys = " ".join(" ".join(part) for part in k)
            ess_keys = nlp_util.stem_sentence(keys)
            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))  # keep at most the top three candidates per key
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict

    logger.debug(pp.pformat(overall_table))
    logger.debug("#" * 50)
    return overall_table
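
# The review variant returns the same {table_name: {'sim', 'keys', 'data'}}
# structure as the issue variant above, but each ranked row is drawn from the
# review table (review_id, content, star_num).
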
    for k in count_dict:
        score += count_dict[k]
    return score


if __name__ == '__main__':
    eee = "it.feio.android.omninotes"
    s = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time()))
    query_decp = util.read_csv("model/data/description_extend_all/" + eee + ".csv")

    print("begin search similar apps")
    scan_output = descript(query_decp, source_category="Productivity",
                           except_files=eee, extend=True, pool_size=32)  # find similar apps
    print("begin rank reviews")
    rank_result = rank_review(scan_output)
    print(util.get_col(scan_output, [0, 1]))

    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
    # write the ranked reviews to a CSV report
    with open(csv_path + eee + now + ".csv", 'w', encoding='utf-8', newline='') as z:
        csv_writer = csv.writer(z)
        csv_writer.writerow(["app_id", "score", "star_num", "helpful_num", "review_content"])
        for i in rank_result:
            csv_writer.writerow([i[0], i[1], i[2].star_num, i[2].helpful_num, i[2].content])

    print("end.")
    print(s)
    print(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time())))