def java_plugin(g, issue_url):
    ex_dir, repo_name, issue_dict = prepare_repo(g, issue_url)
    # Dependency
    raw_depens = []
    gradle_paths = find_all_gradle(ex_dir)
    if gradle_paths:
        for p in gradle_paths:
            tmp = gradle_dependency(p)
            if tmp:
                raw_depens.extend(tmp)
    else:
        pom_paths = find_all_pom(ex_dir)
        for p in pom_paths:
            tmp = pom_dependency(p)
            if tmp:
                raw_depens.extend(tmp)
    if raw_depens:
        # logger.debug(raw_depens)
        depen_ob = Dependencies(repo_name, raw_depens)
        # logger.debug(depen_ob)
        _denp_keys = depen_ob.keywords()
        # logger.debug(_denp_keys)
        denp_key_w = search_keywords(issue_dict["body_tokens"], _denp_keys, mode='depen', min_len=2)
        denp_key_w.sort(key=lambda k: k[-1], reverse=True)
        logger.info(f"Dependency {len(denp_key_w)}=={denp_key_w}")
        return {"depen": util.get_col(denp_key_w, 0)}
    else:
        return {"depen": list()}
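# search_keywords is defined elsewhere in this repo; the sketch below is an
# assumption about its token-based modes ('depen'/'permit') as called above:
# count keyword hits in the issue tokens and return (keyword, count) pairs,
# so the weight sits in the last tuple element, matching the sort above. The
# fuzzy 'ui' mode that takes a raw body string and a threshold is not
# sketched here.
from collections import Counter

def search_keywords(tokens, keys, mode='depen', threshold=None, min_len=2):
    counts = Counter(t.lower() for t in tokens)
    hits = []
    for key in keys:
        if len(key) < min_len:  # skip keywords too short to be meaningful
            continue
        n = counts[key.lower()]
        if n > 0:
            hits.append((key, n))
    return hits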
def android_plugin(g, issue_url):
    ex_dir, repo_name, issue_dict = prepare_repo(g, issue_url)
    root = find_app_root(ex_dir)
    # Permission
    is_path, path = check_manifest_at_root(root)
    if not is_path:
        per_key_w = []
    else:
        name, p_list = get_permission(path)
        permit_ob = Permissions(path, name, ex_dir, p_list)
        _per_keys = permit_ob.keywords()
        per_key_w = search_keywords(issue_dict["body_tokens"], _per_keys, mode='permit', min_len=2)
        per_key_w.sort(key=lambda k: k[-1], reverse=True)
    logger.info(f"Permission {len(per_key_w)}=={per_key_w}")
    # Dependency
    is_path, path = check_gradle_at_root(root)
    if not is_path:
        denp_key_w = []
    else:
        a_json = gradle_dependency(path)
        depen_ob = Dependencies(repo_name, a_json)
        _denp_keys = depen_ob.keywords()
        denp_key_w = search_keywords(issue_dict["body_tokens"], _denp_keys, mode='depen', min_len=2)
        denp_key_w.sort(key=lambda k: k[-1], reverse=True)
    logger.info(f"Dependency {len(denp_key_w)}=={denp_key_w}")
    # UI
    ui_data = get_ui_descript(ex_dir)
    # drop UI matches whose match rate is below 0.5
    ui_key_w = search_keywords(issue_dict["body"], ui_data, mode='ui', threshold=0.5, min_len=3)
    logger.info(f"UI {len(ui_key_w)}=={ui_key_w}")
    return {
        "depen": util.get_col(denp_key_w, 0),
        "permit": util.get_col(per_key_w, 0),
        "ui": util.get_col(ui_key_w, 0),
    }
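# check_manifest_at_root / check_gradle_at_root are not shown; a plausible
# sketch assuming the standard Android module layout (manifest under
# src/main/, build script at the module root). Both return an (exists, path)
# pair, matching how they are unpacked above.
import os

def check_manifest_at_root(root):
    path = os.path.join(root, "src", "main", "AndroidManifest.xml")
    return os.path.isfile(path), path

def check_gradle_at_root(root):
    path = os.path.join(root, "build.gradle")
    return os.path.isfile(path), path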
def get_top_java_repo(g, id, only_android=False):
    file_name = f"openissues_repo_{id}.csv"
    data = util.read_csv(file_name)
    repos = util.get_col(data, 0)
    old_repos = set(repos)
    repos = g.search_repositories(query='language:java', sort="stars", order="desc", language="Java")
    ilistdict = dict()
    with open(file_name, "a+", encoding="utf-8") as file:
        for index in range(repos.totalCount):
            api_wait_search(g)
            repo = repos[index]
            if repo.full_name in old_repos:
                continue
            # XOR: skip when the repo type does not match the only_android flag
            # (equal values mean "keep")
            if only_android ^ is_android_repo(g, repo.full_name):
                logger.info(f"skip {repo.full_name}")
                continue
            file.write(repo.full_name + "," + repo.html_url + "\n")
            file.flush()
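# api_wait_search is not shown; a sketch of what it presumably does. PyGithub
# exposes the search bucket separately from the core bucket, and the search
# API has a much smaller quota, so sleep until the quota resets when it is
# nearly exhausted.
import time
from datetime import datetime, timezone

def api_wait_search(g):
    search_limit = g.get_rate_limit().search
    if search_limit.remaining <= 1:
        reset = search_limit.reset.replace(tzinfo=timezone.utc)
        wait = (reset - datetime.now(timezone.utc)).total_seconds() + 5
        if wait > 0:
            time.sleep(wait)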
def main():
    from persontoken import MY_TOKEN
    g = Github(MY_TOKEN)
    done_open_urls = util.read_csv('./hist.txt')
    done_open_urls = util.get_col(done_open_urls, 0)
    _f = open("data2.csv", 'a+', encoding='utf-8-sig', newline='')
    _f2 = open("hist.txt", 'a+', encoding='utf-8', newline='')
    try:
        csvwriter = csv.writer(_f, delimiter=',')
        for i, open_url in enumerate(open_urls):
            if open_url in done_open_urls:
                continue
            this_row = [""] * 5
            # open_url = 'https://github.com/json-path/JsonPath/issues/460'
            logger.info("-" * 100)
            logger.info(open_url)
            open_iss_ob = util.get_issue(g, open_url)
            this_row[0] = open_url
            # stacktrace / condition / title
            repo_name = open_iss_ob.repository.full_name
            extra_rm = nlp_util.full_name_token(repo_name)  # repo-name tokens to strip from the query
            curr_q = fast_query(open_iss_ob.title, open_iss_ob.body)
            logger.info(f"curr_q, {curr_q}")
            # queries are tried in order: stacktrace, condition, title
            try_pair = [
                (True, False, 'body'),    # stacktrace in body
                (False, True, 'title'),   # condition in title
                (False, False, 'title'),  # title in title
                (False, False, 'other'),  # title (no field constraint)
            ]
            try_hist = []
            all_fail = True
            for _fi, pair in enumerate(try_pair):
                trace, condition, pos = pair
                query_list = form_query(curr_q, extra_rm, trace=trace, condition=condition)
                query_chars = " ".join(query_list)
                query_chars = add_pos(query_chars, pos)
                logger.debug(f"query_chars, {query_chars}")
                if query_list:
                    close_iss = run_close_query(g, query_chars, is_android, depth=10, fallback_size=5)
                    try_hist.append(query_chars)
                    this_row[1] = query_chars
                    if close_iss["info"] == 'NORMAL':
                        all_fail = False
                    else:
                        if close_iss["info"] == 'FALLBACK':
                            all_fail = False
                            logger.info(f"[try {_fi}] FALLBACK failed query [Too few results], {query_chars}")
                        elif close_iss["info"] == 'EMPTY':
                            logger.info(f"[try {_fi}] FALLBACK failed query [Zero results], {query_chars}")
                    if close_iss["iss"]:
                        # columns: open url, open info online, open info offline, close url, close info (rank property)
                        #          0       , 1               , 2                , 3        , 4
                        rank_list = []
                        for _c in close_iss["iss"]:
                            close_url, close_info = _c
                            if is_android:
                                plugin = android_plugin
                            else:
                                plugin = java_plugin
                            open_off = plugin(g, open_url)
                            close_off = plugin(g, close_url)
                            logger.debug(f"open {open_url}")
                            logger.debug(f"open offline rank, {open_off}=={len(open_off.keys())}")
                            logger.debug(f"close {close_url}")
                            logger.debug(f"close offline rank, {close_off}=={len(close_off.keys())}")
                            assert len(open_off.keys()) == len(close_off.keys())
                            all_empty = True
                            join_off = dict()
                            for _k in open_off.keys():
                                join = set(open_off[_k]) & set(close_off[_k])
                                join_off[_k] = list(join)
                                if join:
                                    all_empty = False
                                    close_info.insert(0, f"Off-SIM-{_k}")
                            logger.debug(f"join_off, {join_off}")
                            if all_empty:
                                this_row[2] = "empty offline"
                            else:
                                this_row[2] = json.dumps(join_off)
                            this_row[3] = close_url
                            flag = code_sim_wrap(g, open_url, close_url)
                            if flag:
                                close_info.insert(0, "Code-SIM")
                            this_row[4] = json.dumps(close_info)
                            rank_list.append(deepcopy(this_row))
                        rank_list = rank_issue(rank_list)
                        if rank_list:
                            csvwriter.writerows(rank_list)
                    if close_iss["info"] == 'NORMAL':
                        break
            if all_fail:
                write_list = []
                this_row[4] = 'NONE close issue'
                for col1 in try_hist:
                    this_row[1] = col1
                    write_list.append(deepcopy(this_row))
                csvwriter.writerows(write_list)
            print(open_url, file=_f2)
            _f.flush()
            _f2.flush()
    except Exception as e:
        logger.error(f"{open_url}, skip")
        print(f"{open_url}, skip", file=_f2)
        raise e
    finally:
        _f.close()
        _f2.close()
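# rank_issue comes from rank_issue.py and is not shown; a hypothetical sketch
# consistent with the usage above, assuming candidate rows are ordered by how
# many similarity markers ("Code-SIM", "Off-SIM-<key>") were prepended to the
# JSON-encoded close-issue info in column 4.
def rank_issue(rows):
    def score(row):
        info = row[4]
        return info.count("Code-SIM") + info.count("Off-SIM")
    return sorted(rows, key=score, reverse=True)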
import csv
import json
from copy import deepcopy

from github import Github
from loguru import logger

import nlp_util
import util
from crawlermy import fast_query, run_close_query, form_query, add_pos
from rank_issue import code_sim_wrap, rank_issue, android_plugin, java_plugin
from util import SS

# before running, delete data2.csv, hist.txt, and log/main_one2.log
util.init_logger('main_one2.log', mode='fixed', clear=False)
is_android = False
open_urls = util.read_csv('./openlist.txt', encoding='utf-8')
open_urls = util.get_col(open_urls, 0)
open_urls = util.uniq_list(open_urls)
ss = SS(ip="vm.userx.cn", port=7891)
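# util.get_col and util.uniq_list are used throughout; the minimal sketches
# below are assumptions matching the call sites above: a single index returns
# a flat column, a list of indexes returns sub-rows, and de-duplication
# preserves input order.
def get_col(rows, col):
    if isinstance(col, (list, tuple)):
        return [[row[c] for c in col] for row in rows]
    return [row[col] for row in rows]

def uniq_list(items):
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]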
# test code
# src = "tsv/nextcloud_android_master.tsv"
# src = select_dir(SRC_DIR)
src = _item
src_out = util.read_tsv(src)
src_out = nlp_util.process_tsv(src_out)
file_list = os.listdir(SRC_DIR)
file_list = [os.path.join(SRC_DIR, f) for f in file_list]
if src in file_list:
    file_list.remove(src)
# file_list = ['tsv/owncloud_android_master.tsv']  # one test
# overall similarity between the src app and every app in the database
scan_output = scan_match(src_out, file_list, match_name.ngram_compare, [1, 0.5, 0.5], threshold=0.7)
logger.debug(pp.pformat(util.get_col(scan_output, [0, 1])))
rdb = issuedb.ISSuedb()
# the constraint "where labels like '%bug%' or commit_id is not null" was removed
sql = """select issue_num, comments, state, title, body, commit_id, labels
         from {} order by length(body) desc"""
overall_table = {}  # all related apps and their items
# for i in range(len(scan_output)):
for i in range(4):
    one_dict = {}
    app = scan_output[i][0]
    one_dict['sim'] = scan_output[i][1]
    tab_name = table2tsv.file2table(app)
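# scan_match is defined elsewhere; a sketch of the shape implied by the call
# above: score every candidate TSV against the source app with the supplied
# compare function and weights, keep candidates at or above the threshold,
# and return [path, score] rows sorted by score. The
# compare_fn(src, cand, weights) signature is an assumption.
def scan_match(src_rows, file_list, compare_fn, weights, threshold=0.7):
    scored = []
    for path in file_list:
        cand_rows = nlp_util.process_tsv(util.read_tsv(path))
        score = compare_fn(src_rows, cand_rows, weights)
        if score >= threshold:
            scored.append([path, score])
    scored.sort(key=lambda r: r[1], reverse=True)
    return scored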
i.html_url)


if __name__ == "__main__":
    init_logger(__file__)
    from persontoken import MY_TOKENs
    tklen = len(MY_TOKENs)
    tk_i = 0
    ss = SS(port=7890)
    android = False
    id = 12
    while True:
        g = Github(MY_TOKENs[tk_i % tklen])
        try:
            # get_top_java_repo(g, 6, only_android=False)
            if android:
                urls = util.read_csv('f-droid/f-droid-github-filter.csv')
                urls = util.get_col(urls, 3)
            else:
                urls = util.read_csv('java_repo_list.csv')
                urls = util.get_col(urls, 1)
            download_new_issues(g, urls, id, shuffle=True)
        except RateLimitExceededException:
            logger.error(traceback.format_exc())
            tk_i += 1  # rotate to the next token when the quota is exhausted
        except Exception:
            logger.error(traceback.format_exc())
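# is_android_repo (used by get_top_java_repo above) is not shown; a sketch
# using GitHub code search: treat a repository as Android if an
# AndroidManifest.xml exists anywhere in its default branch.
def is_android_repo(g, full_name):
    api_wait_search(g)  # code search draws from the small search quota
    hits = g.search_code(query=f"repo:{full_name} filename:AndroidManifest.xml")
    return hits.totalCount > 0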