def computeDevActv():
    # Key points in time.
    time_now = time.time()
    time_now_str = _strtime_before_days(time_now, 0)
    time_before_1_window = _strtime_before_days(time_now, EXAMINE_WINDOW)
    # commits_before_1_window, issues_before_1_window, rel_before_1_window
    cbw, ibw, rbw = [], [], []
    metrics = [cbw, ibw, rbw]
    for repo in REPOS:
        if repo in NONE_GH:
            cbw.append(None)
            ibw.append(None)
            rbw.append(None)
        else:
            # Counts within the current examination window.
            cbw.append(
                dbop.select_one(
                    "select count(*) from commits_info where repo_id=%s and (author_date>%s and author_date<%s)",
                    (repo, time_before_1_window, time_now_str))[0])
            ibw.append(
                dbop.select_one(
                    "select count(*) from issues_info where repo_id=%s and (created_at>%s and created_at<%s)",
                    (repo, time_before_1_window, time_now_str))[0])
            rbw.append(
                dbop.select_one(
                    "select count(*) from releases_info where repo_id=%s and (created_at>%s and created_at<%s)",
                    (repo, time_before_1_window, time_now_str))[0])
    nor_metrics = [_nor_data(item) for item in metrics]
    for i in range(0, len(REPOS)):
        dbop.execute(
            "insert into dev_actv(repo_id,dev,rel) values(%s,%s,%s)",
            (REPOS[i], _my_avg([nor_metrics[0][i], nor_metrics[1][i]]),
             nor_metrics[2][i]))
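# _my_avg and _nor_data are used throughout this file but defined elsewhere.
# A minimal sketch of plausible implementations, assuming _my_avg averages the
# non-None entries and _nor_data is a None-preserving min-max normalization;
# the _sketch names are hypothetical, not the confirmed originals.
def _my_avg_sketch(values):
    vals = [v for v in values if v is not None]
    return sum(vals) * 1.0 / len(vals) if vals else None

def _nor_data_sketch(data):
    vals = [v for v in data if v is not None]
    if not vals:
        return data
    lo, hi = min(vals), max(vals)
    span = (hi - lo) or 1.0  # constant columns normalize to 0
    return [None if v is None else (v - lo) * 1.0 / span for v in data]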
def computeQualitySub():
    # Defect repair ratio and average repair time.
    repair_ratio, repair_time = [], []
    metrics = [repair_ratio, repair_time]
    for repo in REPOS:
        if repo in NONE_GH:
            repair_ratio.append(None)
            repair_time.append(None)
        else:
            # issue_total, done
            result = dbop.select_all(
                "select closed_at,created_at from issues_info where repo_id=%s and is_pr=0",
                (repo, ))
            total_num = len(result)
            if total_num == 0:
                tmp_repair_ratio = 0
                tmp_repair_time = 0
            else:
                issue_done = [item for item in result if item[0] is not None]
                tmp_repair_ratio = len(issue_done) * 1.0 / total_num
                if len(issue_done) > 0:
                    tmp_repair_time = sum([
                        _datetime2int(item[0]) - _datetime2int(item[1])
                        for item in issue_done
                    ]) * 1.0 / len(issue_done)
                else:
                    # No closed issue yet; avoid dividing by len(issue_done)==0.
                    tmp_repair_time = 0
            repair_ratio.append(tmp_repair_ratio)
            # Invert so that shorter repair times score higher; +1 avoids
            # division by zero.
            repair_time.append(1.0 / (tmp_repair_time + 1))
    repair_time = _nor_data(repair_time)
    for i in range(0, len(REPOS)):
        dbop.execute(
            "insert into quality_sub(repo_id,repair_ratio,repair_time) values(%s,%s,%s)",
            (REPOS[i], repair_ratio[i], repair_time[i]))
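# _datetime2int is used above but defined elsewhere. A plausible sketch,
# assuming it converts a datetime returned by the DB driver into epoch
# seconds (the _sketch name is hypothetical; relies on the module's
# existing `import time`):
def _datetime2int_sketch(dt):
    return int(time.mktime(dt.timetuple()))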
def computeINF():
    # Key points in time.
    time_now = time.time()
    time_now_str = _strtime_before_days(time_now, 0)
    time_before_1_window = _strtime_before_days(time_now, EXAMINE_WINDOW)
    fans = [[], [], []]
    fans_fb, fans_tw = [], []
    for repo in REPOS:
        if repo in NONE_GH:  # this project does not exist on GitHub
            for i in range(0, 3):
                fans[i].append(None)
        else:
            # Developer-community values.
            fans_now = dbop.select_one(
                "select watch,star,fork from html_info where repo_id=%s and fetched_at<=%s order by fetched_at desc limit 1",
                (repo, time_now_str), (0, 0, 0))
            fans_before = dbop.select_one(
                "select watch,star,fork from html_info where repo_id=%s and fetched_at<=%s order by fetched_at desc limit 1",
                (repo, time_before_1_window), (0, 0, 0))
            # Deltas over the window -- and yes, some really do decrease.
            for i in range(0, 3):
                fans[i].append(fans_now[i] - fans_before[i])
        # Social communities.
        if repo in NONE_FB:
            fans_fb.append(None)
        else:
            fb_now = dbop.select_one(
                "select watches_num from facebook_data where coin_id=%s and created_time<=%s order by created_time desc limit 1",
                (repo, time_now_str), (0, ))
            fb_before = dbop.select_one(
                "select watches_num from facebook_data where coin_id=%s and created_time<=%s order by created_time desc limit 1",
                (repo, time_before_1_window), (0, ))
            fans_fb.append(fb_now[0] - fb_before[0])
        if repo in NONE_TW:
            fans_tw.append(None)
        else:
            tw_now = dbop.select_one(
                "select followers_num from twitters_data where coin_id=%s and created_time<=%s order by created_time desc limit 1",
                (repo, time_now_str), (0, ))
            tw_before = dbop.select_one(
                "select followers_num from twitters_data where coin_id=%s and created_time<=%s order by created_time desc limit 1",
                (repo, time_before_1_window), (0, ))
            fans_tw.append(tw_now[0] - tw_before[0])
    # Normalize.
    fans.extend([fans_fb, fans_tw])
    fans = [_nor_data(item) for item in fans]
    for i in range(0, len(REPOS)):
        tmp_row = []
        for j in range(0, len(fans)):
            tmp_row.append(fans[j][i])
        dbop.execute(
            "insert into inf(repo_id,inf_dev,inf_social) values(%s,%s,%s)",
            (REPOS[i], _my_avg(tmp_row[0:3]), _my_avg(tmp_row[3:])))
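# dbop.select_one takes an optional third argument that is returned when the
# query matches no rows (see the (0, 0, 0) defaults above). A hedged sketch
# of that contract -- dbop's real implementation lives elsewhere, and this
# standalone name is hypothetical:
def _select_one_with_default_sketch(cursor, sql, params, default=None):
    cursor.execute(sql, params)
    row = cursor.fetchone()
    return row if row is not None else default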
def _fetchCommitJson4prj(prj):
    last_page, last_data_set = _get_last_commit_fetch(prj)
    logger.info("\t\t%s: %s last commit page: %s/%s" % (
        threading.current_thread().name, prj, last_page, len(last_data_set)))
    # commit(id,repo_id,sha,author_id,author_name,author_date,committer_id,committer_name,committer_date,parent)
    while last_page is not None:
        # Download and store the raw data.
        url = URL_TEMPLATE % (prj, "commits", last_page)
        result, raw_json = _get_url(url)
        if result is None:
            break
        dbop.execute(
            "insert into commits_json_raw(repo_id, page, raw) values(%s,%s,%s)",
            (REPO_ID[prj], last_page, raw_json))
        new_data_set = json.loads(raw_json)
        # Extract.
        logger.info("\t\t%s: %s new commit page: %s/%s" % (
            threading.current_thread().name, prj, last_page, len(new_data_set)))
        for n_data in new_data_set:
            if n_data["sha"] not in last_data_set:
                parents_sha = ";".join(
                    [parent["sha"] for parent in n_data["parents"]])
                author = n_data["author"]
                # The author's GitHub account is sometimes missing.
                if author is not None and len(author) > 0:
                    author_id, author_name = author["id"], author["login"]
                else:
                    author_id, author_name = None, None
                committer = n_data["committer"]
                if committer is not None and len(committer) > 0:
                    committer_id, committer_name = committer["id"], committer["login"]
                else:
                    committer_id, committer_name = None, None
                dbop.execute(
                    "insert into commits_info(" +
                    "repo_id,page,sha,author_id,author_name,author_date,committer_id,committer_name,committer_date,parents)"
                    + " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (REPO_ID[prj], last_page, n_data["sha"],
                     author_id, author_name,
                     _clean_datetime(n_data["commit"]["author"]["date"]),
                     committer_id, committer_name,
                     _clean_datetime(n_data["commit"]["committer"]["date"]),
                     parents_sha))
        # From here on, last_data_set should be empty.
        last_data_set = []
        # Get the URL of the next list page.
        if 'link' not in result.headers.keys():
            logger.info("\t\t%s: %s may have fewer than 100 commits" % (
                threading.current_thread().name, prj))
            break
        links = result.headers["link"]
        if "next" in links:
            last_page += 1
        else:
            last_page = None
            logger.info("\t\t%s: %s no next link for commits" % (
                threading.current_thread().name, prj))
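# _clean_datetime is used above but defined elsewhere. A plausible sketch,
# assuming it turns GitHub's ISO-8601 timestamps into MySQL DATETIME strings
# and passes None through (the _sketch name and exact behavior are assumptions):
def _clean_datetime_sketch(iso_ts):
    # "2017-05-04T12:30:00Z" -> "2017-05-04 12:30:00"
    if iso_ts is None:
        return None
    return iso_ts.replace("T", " ").rstrip("Z")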
def computeScore():
    M1, M2, M3, M4, M5, M6 = {}, {}, {}, {}, {}, {}
    score = []
    dateTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    for repo in REPOS:
        M1[repo] = _my_avg(
            dbop.select_one(
                "select inf_dev,inf_social from inf where repo_id=%s and computed_at<=%s order by id limit 1",
                (repo, dateTime), (0, 0)))
        # Each default row below matches the number of selected columns.
        M2[repo] = _my_avg(
            dbop.select_one(
                "select issue_done, commit_total, age_dev, fans_dev, fans_social from maturity where repo_id=%s and computed_at<=%s order by id limit 1",
                (repo, dateTime), (0, 0, 0, 0, 0)))
        M3[repo] = _my_avg(
            dbop.select_one(
                "select repair_ratio,repair_time from quality_sub where repo_id=%s and computed_at<=%s order by id limit 1",
                (repo, dateTime), (0, 0)))
        M4[repo] = _my_avg(
            dbop.select_one(
                "select ccr,ngr,tbr from team_health where repo_id=%s and computed_at<=%s order by id limit 1",
                (repo, dateTime), (0, 0, 0)))
        M5[repo] = _my_avg(
            dbop.select_one(
                "select dev,rel from dev_actv where repo_id=%s and computed_at<=%s order by id limit 1",
                (repo, dateTime), (0, 0)))
        M6[repo] = _my_avg(
            dbop.select_one(
                "select dit,tit,dcpt,ucpt from trend where repo_id=%s and computed_at<=%s order by id limit 1",
                (repo, dateTime), (0, 0, 0, 0)))
        score.append(
            (repo,
             _my_avg([M1[repo], M2[repo], M3[repo], M4[repo], M5[repo],
                      M6[repo]])))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    field_sql_str = "prj_id,rank,score,m1_inf,m2_maturity,m3_quality,m4_team_healty,m5_activatin,m6_trend"
    for i in range(0, len(score)):
        repo, r_score = score[i]
        dbop.execute(
            "insert into daily_rank(" + field_sql_str + ") values(%s" +
            ",%s" * 8 + ")",
            (repo, i + 1, r_score, M1[repo], M2[repo], M3[repo], M4[repo],
             M5[repo], M6[repo]))
def _fetchIssueJson4Prj(prj):
    last_page, last_data_set = _get_last_issue_fetch(prj)
    logger.info("\t\t%s: %s last issue page:%s/%s" % (
        threading.current_thread().name, prj, last_page, len(last_data_set)))
    while last_page is not None:
        # Download and store the raw data.
        url = URL_TEMPLATE % (prj, "issues", last_page)
        result, raw_json = _get_url(url)
        if result is None:
            break
        dbop.execute(
            "insert into issues_json_raw(repo_id, page, raw) values(%s,%s,%s)",
            (REPO_ID[prj], last_page, raw_json))
        new_data_set = json.loads(raw_json)
        # Extract.
        logger.info("\t\t%s: %s new issue page:%s/%s" % (
            threading.current_thread().name, prj, last_page, len(new_data_set)))
        for n_data in new_data_set:
            if n_data["number"] not in last_data_set:
                is_pr = 0
                if "pull_request" in n_data.keys():
                    is_pr = 1
                dbop.execute(
                    "insert into issues_info" +
                    "(repo_id,number,page,is_pr,created_at,closed_at,user_id,user_name) values (%s,%s,%s,%s,%s,%s,%s,%s)",
                    (REPO_ID[prj], n_data["number"], last_page, is_pr,
                     _clean_datetime(n_data["created_at"]),
                     _clean_datetime(n_data["closed_at"]),
                     n_data["user"]["id"], n_data["user"]["login"]))
        # From here on, last_data_set should be empty.
        last_data_set = []
        # Get the URL of the next list page.
        if 'link' not in result.headers.keys():
            logger.info("\t\t%s: %s may have fewer than 100 issues" % (
                threading.current_thread().name, prj))
            break
        links = result.headers["link"]
        if "next" in links:
            last_page += 1
        else:
            last_page = None
            logger.info("\t\t%s: %s no next link for issues" % (
                threading.current_thread().name, prj))
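# URL_TEMPLATE is defined elsewhere. The pagination above and the
# "fewer than 100" logs suggest GitHub's list API with per_page=100, and the
# url[28:-37] log slicing hints at a token suffix on the URL. A hedged sketch
# of the shape only (the exact template and token handling are assumptions):
URL_TEMPLATE_SKETCH = "https://api.github.com/repos/%s/%s?per_page=100&page=%s"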
def fetchHtmlInfo(prj):
    logger.info("\t\tfetchHtmlInfo:%s" % (prj))
    # Download.
    logger.info("\t\t download")
    url = "https://github.com/%s" % prj
    url_result = _get_url(url)
    if url_result[0] is None:
        dbop.execute("insert into html_error(repo_id,error_msg) values(%s,%s)",
                     (REPO_ID[prj], url_result[1]))
        return
    logger.info("\t\t extract")
    # Extract.
    ini_html = url_result[0]
    nums, errors = _extract_html(ini_html)
    logger.info("\t\t store")
    # Persist.
    fields = nums.keys()
    if len(fields) > 0:
        values = [nums[field] for field in fields]
        values.insert(0, "%d" % REPO_ID[prj])
        dbop.execute(
            "insert into html_info(repo_id," + ",".join(fields) +
            ") values(%s" + ",%s" * len(fields) + ")", (values))
    for error in errors:
        dbop.execute("insert into html_error(repo_id,error_msg) values(%s,%s)",
                     (REPO_ID[prj], error))
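# _extract_html parses watch/star/fork counts out of the repository HTML page;
# its definition is not in this excerpt. A minimal hedged sketch of the idea,
# assuming the counts can be pulled with regexes over the social-count markup
# (the _sketch name and patterns are assumptions; GitHub's markup changes):
import re

def _extract_html_sketch(html):
    nums, errors = {}, []
    for field, pattern in (("watch", r'(\d[\d,]*)\s+watching'),
                           ("star", r'(\d[\d,]*)\s+stars?'),
                           ("fork", r'(\d[\d,]*)\s+forks?')):
        m = re.search(pattern, html)
        if m:
            nums[field] = int(m.group(1).replace(",", ""))
        else:
            errors.append("could not extract %s" % field)
    return nums, errors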
def _fetchReleaseJson4Prj(prj):
    last_page, last_data_set = _get_last_release_fetch(prj)
    logger.info("\t\t%s: %s last release page: %s/%s" % (
        threading.current_thread().name, prj, last_page, len(last_data_set)))
    while last_page is not None:
        # Download and store the raw data.
        url = URL_TEMPLATE % (prj, "releases", last_page)
        result, raw_json = _get_url(url)
        if result is None:
            break
        dbop.execute(
            "insert into releases_json_raw(repo_id, page, raw) values(%s,%s,%s)",
            (REPO_ID[prj], last_page, raw_json))
        new_data_set = json.loads(raw_json)
        # Extract.
        logger.info("\t\t%s: %s new release page: %s/%s" % (
            threading.current_thread().name, prj, last_page, len(new_data_set)))
        for n_data in new_data_set:
            if n_data["id"] not in last_data_set:
                dbop.execute(
                    "insert into releases_info(" +
                    "repo_id,r_id,page,tag_name,name,created_at,published_at,author_id,author_name)"
                    + " values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (REPO_ID[prj], n_data["id"], last_page, n_data["tag_name"],
                     n_data["name"], _clean_datetime(n_data["created_at"]),
                     _clean_datetime(n_data["published_at"]),
                     n_data["author"]["id"], n_data["author"]["login"]))
        # From here on, last_data_set should be empty.
        last_data_set = []
        # Get the URL of the next list page.
        if 'link' not in result.headers.keys():
            logger.info("\t\t%s: %s may have fewer than 100 releases" % (
                threading.current_thread().name, prj))
            break
        links = result.headers["link"]
        if "next" in links:
            last_page += 1
        else:
            last_page = None
            logger.info("\t\t%s: %s no next link for releases" % (
                threading.current_thread().name, prj))
def main():
    # Clear out the previous GitHub URLs.
    dbop.execute("update prj_list set github_url=Null")
    # Refresh the github_url field.
    with open("prjs.txt", "r") as fp:
        for prj_line in fp.readlines():
            prjls = [item.strip() for item in prj_line.split("\t")]
            dbop.execute("update prj_list set github_url=%s where prj_id=%s",
                         (prjls[1], prjls[0]))
    # Add a column.
    dbop.execute("alter table prj_list add prj_type varchar(50);")
    dbop.execute("update prj_list set prj_type='blockchain';")
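# The parsing above implies prjs.txt is tab-separated: the project id in the
# first column, the project's GitHub URL in the second. Illustrative rows
# (the values are made up, not from the real file):
#
#   1<TAB>https://github.com/bitcoin/bitcoin
#   2<TAB>https://github.com/ethereum/go-ethereum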
def computeTrend():
    # Key points in time.
    time_now = time.time()
    time_now_str = _strtime_before_days(time_now, 0)
    time_before_1_window = _strtime_before_days(time_now, EXAMINE_WINDOW)
    time_before_2_window = _strtime_before_days(time_now, 2 * EXAMINE_WINDOW)
    time_before_3_window = _strtime_before_days(time_now, 3 * EXAMINE_WINDOW)
    # Each metric is the second difference of a count series divided by the
    # previous first difference, with +1.0 smoothing when that difference is 0.
    dits, tits, dcpts, ucpts = [], [], [], []
    for repo in REPOS:
        if repo in NONE_GH:
            dits.append(None)
            tits.append(None)
            dcpts.append(None)
        else:
            # dit: trend of commit counts.
            commits_before_1_window = dbop.select_one(
                "select count(*) from commits_info where repo_id=%s and (author_date>%s and author_date<=%s)",
                (repo, time_before_1_window, time_now_str))[0]
            commits_before_2_window = dbop.select_one(
                "select count(*) from commits_info where repo_id=%s and (author_date>%s and author_date<=%s)",
                (repo, time_before_2_window, time_before_1_window))[0]
            commits_before_3_window = dbop.select_one(
                "select count(*) from commits_info where repo_id=%s and (author_date>%s and author_date<=%s)",
                (repo, time_before_3_window, time_before_2_window))[0]
            if (commits_before_2_window - commits_before_3_window) == 0:
                dits.append(
                    ((commits_before_1_window - 2 * commits_before_2_window +
                      commits_before_3_window) + 1.0) /
                    ((commits_before_2_window - commits_before_3_window) + 1.0))
            else:
                dits.append(
                    (commits_before_1_window - 2 * commits_before_2_window +
                     commits_before_3_window) * 1.0 /
                    (commits_before_2_window - commits_before_3_window))
            # tit: trend of issue counts.
            issues_before_1_window = dbop.select_one(
                "select count(*) from issues_info where repo_id=%s and is_pr=0 and (created_at>%s and created_at<=%s)",
                (repo, time_before_1_window, time_now_str))[0]
            issues_before_2_window = dbop.select_one(
                "select count(*) from issues_info where repo_id=%s and is_pr=0 and (created_at>%s and created_at<=%s)",
                (repo, time_before_2_window, time_before_1_window))[0]
            issues_before_3_window = dbop.select_one(
                "select count(*) from issues_info where repo_id=%s and is_pr=0 and (created_at>%s and created_at<=%s)",
                (repo, time_before_3_window, time_before_2_window))[0]
            if (issues_before_2_window - issues_before_3_window) == 0:
                tits.append(
                    ((issues_before_1_window - 2 * issues_before_2_window +
                      issues_before_3_window) + 1.0) /
                    ((issues_before_2_window - issues_before_3_window) + 1.0))
            else:
                tits.append(
                    (issues_before_1_window - 2 * issues_before_2_window +
                     issues_before_3_window) * 1.0 /
                    (issues_before_2_window - issues_before_3_window))
            # dcpt: trend of developer-community popularity (watch+star+fork).
            fans_before_1_window = _my_sum(
                dbop.select_one(
                    "select watch,star,fork from html_info where repo_id=%s and fetched_at<=%s order by fetched_at desc limit 1",
                    (repo, time_now_str), (0, 0, 0)))
            fans_before_2_window = _my_sum(
                dbop.select_one(
                    "select watch,star,fork from html_info where repo_id=%s and fetched_at<=%s order by fetched_at desc limit 1",
                    (repo, time_before_1_window), (0, 0, 0)))
            fans_before_3_window = _my_sum(
                dbop.select_one(
                    "select watch,star,fork from html_info where repo_id=%s and fetched_at<=%s order by fetched_at desc limit 1",
                    (repo, time_before_2_window), (0, 0, 0)))
            if (fans_before_2_window - fans_before_3_window) == 0:
                dcpts.append(
                    ((fans_before_1_window - 2 * fans_before_2_window +
                      fans_before_3_window) + 1.0) /
                    (fans_before_2_window - fans_before_3_window + 1.0))
            else:
                dcpts.append(
                    (fans_before_1_window - 2 * fans_before_2_window +
                     fans_before_3_window) * 1.0 /
                    (fans_before_2_window - fans_before_3_window))
        # ucpt: trend of user-community (social) popularity. This runs at the
        # loop level so ucpts stays aligned with REPOS even for NONE_GH repos.
        if repo in NONE_FB and repo in NONE_TW:
            ucpts.append(None)
        else:
            fans_before_1_window = _socialfans_till_time(repo, time_now_str)
            fans_before_2_window = _socialfans_till_time(
                repo, time_before_1_window)
            fans_before_3_window = _socialfans_till_time(
                repo, time_before_2_window)
            if (fans_before_2_window - fans_before_3_window) == 0:
                ucpts.append(
                    ((fans_before_1_window - 2 * fans_before_2_window +
                      fans_before_3_window) + 1.0) /
                    (fans_before_2_window - fans_before_3_window + 1.0))
            else:
                ucpts.append(
                    (fans_before_1_window - 2 * fans_before_2_window +
                     fans_before_3_window) * 1.0 /
                    (fans_before_2_window - fans_before_3_window))
    dits, tits, dcpts, ucpts = (_nor_data(dits), _nor_data(tits),
                                _nor_data(dcpts), _nor_data(ucpts))
    for i in range(0, len(REPOS)):
        dbop.execute(
            "insert into trend(repo_id,dit,tit,dcpt,ucpt) values(%s,%s,%s,%s,%s)",
            (REPOS[i], dits[i], tits[i], dcpts[i], ucpts[i]))
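# EXAMINE_WINDOW and _strtime_before_days are defined elsewhere. A hedged
# sketch of the helper, assuming it formats "now minus N days" as a MySQL
# DATETIME string and that EXAMINE_WINDOW is the window length in days
# (the _sketch name and exact format are assumptions):
def _strtime_before_days_sketch(now_ts, days):
    return time.strftime('%Y-%m-%d %H:%M:%S',
                         time.localtime(now_ts - days * 24 * 3600))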
def computeTeamHealth():
    # Key points in time.
    time_now = time.time()
    time_now_str = _strtime_before_days(time_now, 0)
    time_before_1_window = _strtime_before_days(time_now, EXAMINE_WINDOW)
    time_before_2_window = _strtime_before_days(time_now, 2 * EXAMINE_WINDOW)
    time_before_3_window = _strtime_before_days(time_now, 3 * EXAMINE_WINDOW)
    ccrs, ngrs, tbrs = [], [], []
    for repo in REPOS:
        if repo in NONE_GH:
            ccrs.append(None)
            ngrs.append(None)
            tbrs.append(None)
        else:
            # Author sets for the three examination windows.
            data_before_1_window = set([
                item[0] for item in dbop.select_all(
                    "select author_id from commits_info where repo_id=%s and author_id is not null and (author_date>%s and author_date<%s)",
                    (repo, time_before_1_window, time_now_str))
            ])
            data_before_2_window = set([
                item[0] for item in dbop.select_all(
                    "select author_id from commits_info where repo_id=%s and author_id is not null and (author_date>%s and author_date<%s)",
                    (repo, time_before_2_window, time_before_1_window))
            ])
            data_before_3_window = set([
                item[0] for item in dbop.select_all(
                    "select author_id from commits_info where repo_id=%s and author_id is not null and (author_date>%s and author_date<%s)",
                    (repo, time_before_3_window, time_before_2_window))
            ])
            # ccr: share of the previous window's authors still committing.
            data_common = _common_num(data_before_1_window,
                                      data_before_2_window)
            ccrs.append(data_common * 1.0 /
                        (len(data_before_2_window) + 1))  # avoid division by zero
            # ngr: growth rate of newcomers between adjacent windows.
            new_users_1 = len(data_before_1_window) - data_common + 1  # avoid division by zero
            data_common_2 = _common_num(data_before_3_window,
                                        data_before_2_window)
            new_users_2 = len(data_before_2_window) - data_common_2 + 1  # avoid division by zero
            ngrs.append((new_users_1 - new_users_2) * 1.0 / new_users_2)
            # tbr: contribution balance over the previous window.
            commits_dis = dbop.select_all(
                "select count(*) from commits_info where repo_id=%s and author_id is not null group by author_id",
                (repo, ))
            issues_dis = dbop.select_all(
                "select count(*) from issues_info where repo_id=%s and user_id is not null group by user_id",
                (repo, ))
            tbrs.append(1.0 / (_gini([item[0] for item in commits_dis]) +
                               _gini([item[0] for item in issues_dis]) + 1))
    metrics = []
    metrics.append(_nor_data(ccrs))
    metrics.append(_nor_data(ngrs))
    metrics.append(_nor_data(tbrs))
    for i in range(0, len(REPOS)):
        tmp_row = [REPOS[i]]
        for j in range(0, len(metrics)):
            tmp_row.append(metrics[j][i])
        dbop.execute(
            "insert into team_health(repo_id, ccr,ngr,tbr) values(%s,%s,%s,%s)",
            tmp_row)
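# _common_num and _gini are referenced above but not defined in this excerpt.
# Hedged sketches (the _sketch names are hypothetical): _common_num is
# presumably the size of a set intersection, and _gini the Gini coefficient
# of a contribution distribution.
def _common_num_sketch(set_a, set_b):
    return len(set_a & set_b)

def _gini_sketch(values):
    # Gini via the mean absolute difference: G = sum|xi-xj| / (2 * n^2 * mean);
    # 0 means perfectly even contributions. O(n^2), fine for small teams.
    if not values:
        return 0.0
    n = len(values)
    mean = sum(values) * 1.0 / n
    if mean == 0:
        return 0.0
    diff_sum = sum(abs(x - y) for x in values for y in values)
    return diff_sum / (2.0 * n * n * mean)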
def computeMaturity():
    # maturity: repo_id, issue_done, commit_total, age_dev, fans_dev
    issue_done, commit_total, age_dev = [], [], []
    stars, watchs, forks = [], [], []
    fans_fb, fans_tw = [], []
    metrics = [
        issue_done, commit_total, age_dev, stars, watchs, forks, fans_fb,
        fans_tw
    ]
    # Collect each metric.
    for repo_id in REPOS:
        if repo_id in NONE_GH:
            issue_done.append(None)
            commit_total.append(None)
            age_dev.append(None)
            stars.append(None)
            watchs.append(None)
            forks.append(None)
        else:
            # issue_done
            result = dbop.select_one(
                "select count(*) from issues_info where repo_id=%s and is_pr=0 and closed_at is not NULL",
                (repo_id, ))
            issue_done.append(result[0])
            # commit_total
            result = dbop.select_one(
                "select count(*) from commits_info where repo_id=%s",
                (repo_id, ))
            commit_total.append(result[0])
            # age_dev
            result = dbop.select_all(
                "select author_date from commits_info where repo_id =%s",
                (repo_id, ))
            age_dev.append(_continuous_dev_month(result))
            # fans_dev; columns come back in (watch, star, fork) order.
            result = dbop.select_one(
                "select watch,star,fork from html_info where repo_id=%s order by id desc limit 1",
                (repo_id, ), (0, 0, 0))
            watchs.append(result[0])
            stars.append(result[1])
            forks.append(result[2])
        if repo_id in NONE_FB:
            fans_fb.append(None)
        else:
            # fans_social
            result = dbop.select_one(
                "select watches_num from facebook_data where coin_id=%s order by id desc limit 1",
                (repo_id, ), (0, ))
            fans_fb.append(result[0])
        if repo_id in NONE_TW:
            fans_tw.append(None)
        else:
            result = dbop.select_one(
                "select followers_num from twitters_data where coin_id=%s order by id desc limit 1",
                (repo_id, ), (0, ))
            fans_tw.append(result[0])
    # Normalize.
    nor_data = []
    for metric in metrics:
        nor_data.append(_nor_data(metric))
    for i in range(0, len(REPOS)):
        tmp_row = [nor_metric[i] for nor_metric in nor_data]
        dbop.execute(
            "insert into maturity(repo_id, issue_done, commit_total, age_dev, fans_dev, fans_social) values(%s,%s,%s,%s,%s,%s)",
            (REPOS[i], tmp_row[0], tmp_row[1], tmp_row[2],
             _my_avg(tmp_row[3:-2]), _my_avg(tmp_row[-2:])))
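# _continuous_dev_month is not defined in this excerpt. One plausible reading,
# given the name and the (author_date,) rows passed in: the longest run of
# consecutive calendar months that each contain at least one commit. The
# sketch below implements that reading; it is an assumption, not the
# confirmed original.
def _continuous_dev_month_sketch(rows):
    months = sorted(set((r[0].year, r[0].month) for r in rows if r[0]))
    best = run = 1 if months else 0
    for prev, cur in zip(months, months[1:]):
        # The month after (y, 12) is (y+1, 1); bool arithmetic handles the carry.
        nxt = (prev[0] + (prev[1] == 12), prev[1] % 12 + 1)
        run = run + 1 if cur == nxt else 1
        best = max(best, run)
    return best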
# Body of _get_url(url, retry_times): download a URL, log failures to
# json_error, and retry until retry_times is exhausted. The enclosing def is
# not shown in this excerpt; send_headers and token come from the module.
    req = urllib2.Request(url, headers=send_headers)
    try:
        error_msg = None
        result = urllib2.urlopen(req, timeout=20)
        raw_data = result.read().decode('utf-8')
        logger.info("\t\t%s: downloaded:\t%s:%s" % (
            threading.current_thread().name, url[28:-37], token[1:8]))
    except urllib2.HTTPError as e:
        error_msg = e.code
    except urllib2.URLError as e:
        error_msg = e.reason
    except Exception as e:
        error_msg = str(e)
    if error_msg is not None:
        dbop.execute("insert into json_error(url,error) values(%s,%s)",
                     (url, error_msg))
        logger.info("\t\t%s: error_msg:\t%s,%s:%s" % (
            threading.current_thread().name, error_msg, url[28:-37],
            token[1:8]))
        if retry_times == 0:
            return None, None
        else:
            logger.info("\t\t%s: retry:\t%s:%s" % (
                threading.current_thread().name, url[28:-37], token[1:8]))
            return _get_url(url, retry_times - 1)
    return result, raw_data

def _get_last_issue_fetch(prj):
    # Recover the last fetched page and the issue numbers already stored.
    last_page = dbop.select_one(
        "select page from issues_json_raw where repo_id=%s order by id desc limit 1",
        (REPO_ID[prj], ), (1, ))[0]
    last_data_set = set([
        item[0] for item in dbop.select_all(
            "select number from issues_info where repo_id=%s and page =%s",
            (REPO_ID[prj], last_page))
    ])
    return last_page, last_data_set
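# send_headers is defined elsewhere in the module. The GitHub API rejects
# requests that lack a User-Agent header, so a hedged sketch of what it
# likely contains (the name suffix and values are assumptions):
send_headers_sketch = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/vnd.github.v3+json",
}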