def getDataForRepository(owner, repo, limit=-1, start=-1):
    """For the target owner/repo, fetch review-related information for the
    pull requests numbered from start down to start - limit."""

    """Resolve start and limit"""
    if start == -1:
        # Fetch the project's pull request count (synchronously here)
        requestNumber = ApiHelper(owner, repo).getMaxSolvedPullRequestNumberForProject()
        print("total pull request number:", requestNumber)
        startNumber = requestNumber
    else:
        startNumber = start
    if limit == -1:
        limit = startNumber

    """Set repo information"""
    AsyncApiHelper.setRepo(owner, repo)
    t1 = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = t1

    """Crawl pull-request information with asynchronous coroutines"""
    loop = asyncio.get_event_loop()
    # Pass the resolved startNumber, not the raw start argument, so the
    # auto-detected upper bound is actually used when start == -1.
    task = [asyncio.ensure_future(AsyncProjectAllDataFetcher.preProcess(loop, limit, startNumber, statistic))]
    tasks = asyncio.gather(*task)
    loop.run_until_complete(tasks)
    print("useful pull request:", statistic.usefulRequestNumber,
          " useful review:", statistic.usefulReviewNumber,
          " useful review comment:", statistic.usefulReviewCommentNumber,
          " useful issue comment:", statistic.usefulIssueCommentNumber,
          " useful commit:", statistic.usefulCommitNumber,
          " cost time:", datetime.now() - statistic.startTime)
def getDataForRepository(repo_id, owner, repo, limit, start):
    """For the target owner/repo, fetch review-related information for the
    pull requests numbered from start down to start - limit."""

    """Set repo information; here owner is the namespace in GitLab"""
    AsyncApiHelper.setRepo(owner, repo)
    AsyncApiHelper.setRepoId(repo_id)
    t1 = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = t1

    """Crawl pull-request information with asynchronous coroutines"""
    loop = asyncio.get_event_loop()
    task = [
        asyncio.ensure_future(
            AsyncProjectAllDataFetcher.preProcess(loop, limit, start, statistic))
    ]
    tasks = asyncio.gather(*task)
    loop.run_until_complete(tasks)
    print("useful pull request:", statistic.usefulRequestNumber,
          " useful review:", statistic.usefulReviewNumber,
          " useful review comment:", statistic.usefulReviewCommentNumber,
          " useful issue comment:", statistic.usefulIssueCommentNumber,
          " useful commit:", statistic.usefulCommitNumber,
          " cost time:", datetime.now() - statistic.startTime)
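# A minimal usage sketch for the GitLab variant defined directly above; the
# repo_id, namespace, project name, and numeric bounds below are hypothetical
# placeholders, not project defaults. The GitHub variant further above can
# instead resolve start and limit automatically when both are left at -1.
def demoGetDataForRepository():
    """Sketch only: crawl merge requests #5000 down to #4001 of a
    hypothetical gitlab-org/gitlab project whose internal id is 1234."""
    getDataForRepository(repo_id=1234, owner='gitlab-org', repo='gitlab',
                         limit=1000, start=5000)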
async def preProcess(loop, limit, start, statistic):
    """Preparation work"""
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate

    """Multiple coroutines"""
    tasks = [asyncio.ensure_future(AsyncApiHelper.downloadInformation(pull_number, semaphore, statistic))
             for pull_number in range(start, max(start - limit, 0), -1)]
    await asyncio.wait(tasks)
async def preProcess(loop, limit, start, statistic):
    """Preparation work"""
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate

    """Initialize the database"""
    mysql = await getMysqlObj(loop)
    if configPraser.getPrintMode():
        print("mysql init success")

    """Multiple coroutines"""
    tasks = []
    if configPraser.getApiVersion() == StringKeyUtils.API_VERSION_RESET:
        # REST (v3) crawl path
        tasks = [asyncio.ensure_future(AsyncApiHelper.downloadInformation(pull_number, semaphore, mysql, statistic))
                 for pull_number in range(start, max(start - limit, 0), -1)]
    elif configPraser.getApiVersion() == StringKeyUtils.API_VERSION_GRAPHQL:
        # GraphQL (v4) crawl path
        tasks = [
            asyncio.ensure_future(AsyncApiHelper.downloadInformationByV4(pull_number, semaphore, mysql, statistic))
            for pull_number in range(start, max(start - limit, 0), -1)]
    if tasks:  # asyncio.wait raises on an empty task list
        await asyncio.wait(tasks)
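# A minimal sketch of the throttling pattern the preProcess variants rely on.
# The body is hypothetical (the real work happens inside
# AsyncApiHelper.downloadInformation); it only illustrates that every download
# coroutine acquires the shared semaphore before touching the network, so at
# most configPraser.getSemaphore() requests are in flight at once.
async def demoThrottledDownload(pull_number, semaphore, statistic):
    """Sketch only: acquire the shared semaphore, do the (simulated) request,
    then update the shared statistics object."""
    async with semaphore:
        await asyncio.sleep(0)  # placeholder for the actual HTTP request
        statistic.usefulRequestNumber += 1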
async def preProcessUserFollowList(loop, statistic, userList):
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate
    mysql = await getMysqlObj(loop)
    if configPraser.getPrintMode():
        print("mysql init success")

    tasks = [asyncio.ensure_future(AsyncApiHelper.downloadUserFollowList(login, semaphore, mysql, statistic))
             for login in userList]  # nesting many nodes in one query can save requests
    await asyncio.wait(tasks)
def getPullRequestTimeLine(owner, repo, nodes):
    # Fetch the timeline information of several pull requests and concatenate the comments on them
    AsyncApiHelper.setRepo(owner, repo)
    t1 = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = t1
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate

    loop = asyncio.get_event_loop()
    coro = getMysqlObj(loop)
    task = loop.create_task(coro)
    loop.run_until_complete(task)
    mysql = task.result()

    """Split the nodes into batches of ten; pad the tail with None so the
    array reshapes cleanly (reshape(-1, 10) yields consecutive chunks of ten,
    which is what the batched GraphQL request expects)"""
    if len(nodes) % 10 != 0:
        offset = 10 - len(nodes) % 10
        nodes.extend([None for i in range(0, offset)])
    nodesGroup = np.array(nodes).reshape(-1, 10)

    tasks = [
        asyncio.ensure_future(AsyncApiHelper.downloadRPTimeLine(
            [x for x in nodegroup.tolist() if x is not None],
            semaphore, mysql, statistic))
        for nodegroup in nodesGroup]  # nesting many nodes in one query can save requests
    tasks = asyncio.gather(*tasks)
    loop.run_until_complete(tasks)
    print('cost time:', datetime.now() - t1)
    return tasks.result()
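# A minimal usage sketch for getPullRequestTimeLine; the owner/repo names are
# hypothetical placeholders and the node ids are taken from the test cases
# listed further below. With two nodes, the list is padded to ten and sent as
# a single batched request; the None padding is filtered out before sending.
def demoGetPullRequestTimeLine():
    """Sketch only: fetch the timelines of two pull requests in one batch."""
    nodes = ['MDExOlB1bGxSZXF1ZXN0MTQ3NDczNTIx',
             'MDExOlB1bGxSZXF1ZXN0Mjg1NzExNTIx']
    results = getPullRequestTimeLine('rails', 'rails', nodes)
    print(results)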
async def preProcessUnmatchCommits(loop, statistic):
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate
    mysql = await getMysqlObj(loop)
    if configPraser.getPrintMode():
        print("mysql init success")

    res = await AsyncSqlHelper.query(mysql, SqlUtils.STR_SQL_QUERY_UNMATCH_COMMITS, None)
    print(res)
    tasks = [asyncio.ensure_future(AsyncApiHelper.downloadCommits(item[0], item[1], semaphore, mysql, statistic))
             for item in res]  # nesting many nodes in one query can save requests
    await asyncio.wait(tasks)
def testChangeTriggerAnalyzer(owner, repo, pull_request_node):
    AsyncApiHelper.setRepo(owner, repo)

    """Read the PRTimeline file and collect the prs whose change_trigger needs analyzing"""
    pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                              header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
    pr_nodes.sort()

    """Take the subset belonging to the given pull request"""
    pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'] == pull_request_node]

    """Group the subset by pull_request_node"""
    grouped_timeline = pr_timeline_items.groupby('pullrequest_node')

    """Store the grouped result as a dict {pr -> pr_timeline_items}"""
    formatted_data = []
    for pr, group in grouped_timeline:
        record = group.to_dict(orient='records')
        record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
        formatted_data.append(record)

    """Analyze the timelines of these prs"""
    pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formatted_data)
    print("finish!")
async def preProcessUnmatchCommitFile(loop, statistic):
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate
    mysql = await getMysqlObj(loop)
    if configPraser.getPrintMode():
        print("mysql init success")

    fetch_size = 2000
    total = await AsyncSqlHelper.query(mysql, SqlUtils.STR_SQL_QUERY_UNMATCH_COMMIT_FILE_COUNT_BY_HAS_FETCHED_FILE,
                                       None)
    fetch_loop = int(total[0][0] / fetch_size)
    for i in range(0, fetch_loop):
        # Pages are sampled at random, so some may repeat and some may be
        # skipped within a single run.
        start = random.randint(0, fetch_loop - 1)
        res = await AsyncSqlHelper.query(mysql, SqlUtils.STR_SQL_QUERY_UNMATCH_COMMIT_FILE_BY_HAS_FETCHED_FILE,
                                         [start * fetch_size])
        print(res)
        if not res:  # asyncio.wait raises on an empty task list
            continue
        tasks = [
            asyncio.ensure_future(AsyncApiHelper.downloadCommits(item[0], item[1], semaphore, mysql, statistic))
            for item in res]  # nesting many nodes in one query can save requests
        await asyncio.wait(tasks)
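# A minimal sketch of the paging arithmetic used above. This helper is
# hypothetical (it does not exist in the project); it only shows that the
# table is read in pages of fetch_size rows and that each SQL query receives
# the row offset page * fetch_size.
def demoPageOffsets(total_rows, fetch_size=2000):
    """Sketch only: return the list of row offsets covering total_rows."""
    pages = int(total_rows / fetch_size)
    return [page * fetch_size for page in range(pages)]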
async def preProcessNoOriginLineReviewComment(loop, statistic, owner, repo, min_num, max_num):
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())  # throttle the request rate
    mysql = await getMysqlObj(loop)
    if configPraser.getPrintMode():
        print("mysql init success")

    repoName = owner + '/' + repo
    values = [repoName, repoName, min_num, max_num]
    total = await AsyncSqlHelper.query(mysql, SqlUtils.STR_SQL_QUERY_NO_ORIGINAL_LINE_REVIEW_COMMENT_COUNT, values)
    fetch_loop = math.ceil(total[0][0] / 2000)
    for i in range(0, fetch_loop):
        res = await AsyncSqlHelper.query(mysql, SqlUtils.STR_SQL_QUERY_NO_ORIGINAL_LINE_REVIEW_COMMENT, values)
        print("fetched size:", len(res))
        if not res:  # asyncio.wait raises on an empty task list
            break
        tasks = [asyncio.ensure_future(
            AsyncApiHelper.downloadSingleReviewComment(repoName, item[0], semaphore, mysql, statistic))
            for item in res]  # nesting many nodes in one query can save requests
        await asyncio.wait(tasks)
def getPRChangeTriggerData(owner, repo):
    """Build the pr change_trigger data from ALL_{repo}_data_prtimeline.tsv"""
    AsyncApiHelper.setRepo(owner, repo)

    """PRTimeLine header"""
    PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node", "comment_type",
                                 "change_trigger", "filepath"]

    """Initialize the target file"""
    target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
    target_content = DataFrame(columns=PR_CHANGE_TRIGGER_COLUMNS)
    # Uncomment to write the header row first; by default batches are appended without a header.
    # pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
    #                           header=pandasHelper.INT_WRITE_WITH_HEADER)

    """Read the PRTimeline file and collect the prs whose change_trigger needs analyzing"""
    pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                              header=pandasHelper.INT_READ_FILE_WITH_HEAD)

    """Read PullRequestData to find the author of each pr"""
    pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
    pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename,
                                          header=pandasHelper.INT_READ_FILE_WITH_HEAD)

    """Collect the author of each pr, used later to filter out comments made by the author"""
    pr_author_map = {}
    for index, row in pr_data_df.iterrows():
        pr_author_map[row['node_id']] = row['user_login']

    pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
    pr_nodes.sort()
    # Test cases:
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjE5MjEzOTc5']  # reopened three times
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjA0MTk5ODkw']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDQwOTAxMzk0']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MzE1OTU0NDgw']  # review outside the pr
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTQ3NDczNTIx']  # ordinary case
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDM4NjAzMjk2']  # very many reviews
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0Mjg1NzExNTIx']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTAxNTUwMTcw']

    """Set fetch parameters"""
    pos = 0
    fetchLimit = 400
    size = len(pr_nodes)
    Logger.logi("there are {0} prs to analyze".format(size))
    t1 = datetime.now()
    while pos < size:
        print("now:", pos, ' total:', size, 'cost time:', datetime.now() - t1)
        Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))

        """Take a subset according to the fetch limit"""
        sub_prs = pr_nodes[pos:pos + fetchLimit]
        pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'].isin(sub_prs)]

        """Group the subset by pull_request_node"""
        grouped_timeline = pr_timeline_items.groupby('pullrequest_node')

        """Store the grouped result as a dict {pr -> pr_timeline_items}"""
        formatted_data = []
        for pr, group in grouped_timeline:
            record = group.to_dict(orient='records')
            record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
            formatted_data.append(record)

        """Analyze the timelines of these prs"""
        pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formatted_data,
                                                                                         pr_author_map)
        pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]

        """Deduplicate the analysis result and append it to the change_trigger file"""
        if len(pr_change_trigger_comments) > 0:
            target_content = DataFrame()
            target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
            target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
            target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
            if not target_content.empty:
                pandasHelper.writeTSVFile(target_filename, target_content,
                                          pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                          header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
        Logger.logi("successfully analyzed {0} prs".format(pos))
        pos += fetchLimit
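# A minimal usage sketch for getPRChangeTriggerData; the owner/repo names are
# hypothetical placeholders. It assumes the ALL_{repo}_data_prtimeline.tsv and
# ALL_{repo}_data_pullrequest.tsv files already exist under the configured
# data paths, i.e. that the timeline and pull-request crawls above have run.
def demoGetPRChangeTriggerData():
    """Sketch only: analyze the crawled timelines of a project and append the
    change_trigger rows to ALL_{repo}_data_pr_change_trigger.tsv."""
    getPRChangeTriggerData('rails', 'rails')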