def getDataForRepository(owner, repo, limit=-1, start=-1):
    """Fetch pull-request review data for owner/repo, covering PR numbers
    from `start` down to `start - limit`.

    Parameters:
        owner, repo: target repository coordinates.
        limit: how many pull requests to fetch; -1 means "all of them".
        start: highest PR number to start from; -1 means "synchronously
            detect the project's max solved PR number and start there".

    NOTE(review): a second function with the same name but a different
    signature exists later in this module and will shadow this one at
    import time — confirm which definition callers expect.
    """
    # Resolve the effective starting PR number.
    if start == -1:
        # Query the total pull-request count synchronously.
        requestNumber = ApiHelper(owner, repo).getMaxSolvedPullRequestNumberForProject()
        print("total pull request number:", requestNumber)
        startNumber = requestNumber
    else:
        startNumber = start
    if limit == -1:
        limit = startNumber

    # Configure repo context for the async helpers.
    AsyncApiHelper.setRepo(owner, repo)
    t1 = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = t1

    # Crawl pull-request info with async coroutines.
    loop = asyncio.get_event_loop()
    # BUG FIX: the original passed the raw `start` (possibly -1) to
    # preProcess, discarding the resolved startNumber computed above;
    # pass startNumber so the auto-detected starting PR is actually used.
    task = [asyncio.ensure_future(
        AsyncProjectAllDataFetcher.preProcess(loop, limit, startNumber, statistic))]
    tasks = asyncio.gather(*task)
    loop.run_until_complete(tasks)

    print("useful pull request:", statistic.usefulRequestNumber,
          " useful review:", statistic.usefulReviewNumber,
          " useful review comment:", statistic.usefulReviewCommentNumber,
          " useful issue comment:", statistic.usefulIssueCommentNumber,
          " useful commit:", statistic.usefulCommitNumber,
          " cost time:", datetime.now() - statistic.startTime)
def getDataForRepository(repo_id, owner, repo, limit, start):
    """Fetch pull-request review data for the repo identified by repo_id.

    Covers PR numbers from `start` down to `start - limit`. Here `owner`
    corresponds to the GitLab namespace.
    """
    # Register the repo context (namespace/name and numeric id) with the
    # async API helper before spawning any coroutines.
    AsyncApiHelper.setRepo(owner, repo)
    AsyncApiHelper.setRepoId(repo_id)

    statistic = statisticsHelper()
    statistic.startTime = datetime.now()

    # Run the async crawl of pull-request information to completion.
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(
        AsyncProjectAllDataFetcher.preProcess(loop, limit, start, statistic))
    loop.run_until_complete(asyncio.gather(future))

    # Summarize what was collected and how long it took.
    print("useful pull request:", statistic.usefulRequestNumber,
          " useful review:", statistic.usefulReviewNumber,
          " useful review comment:", statistic.usefulReviewCommentNumber,
          " useful issue comment:", statistic.usefulIssueCommentNumber,
          " useful commit:", statistic.usefulCommitNumber,
          " cost time:", datetime.now() - statistic.startTime)
def getUnmatchedCommitFile():
    """Fetch commits that have no file records in the database yet
    (at most 2000 per invocation)."""
    started = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = started

    # Drive the single preprocessing coroutine to completion.
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(
        AsyncProjectAllDataFetcher.preProcessUnmatchCommitFile(loop, statistic))
    loop.run_until_complete(asyncio.gather(future))

    print('cost time:', datetime.now() - started)
def getUserFollowList(userList):
    """Fetch the follow list for each user in the given user list."""
    started = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = started

    # Drive the single preprocessing coroutine to completion.
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(
        AsyncProjectAllDataFetcher.preProcessUserFollowList(loop, statistic, userList))
    loop.run_until_complete(asyncio.gather(future))

    print('cost time:', datetime.now() - started)
def getNoOriginLineReviewComment(owner, repo, min_num, max_num):
    """Fetch review comments missing from the database for owner/repo,
    within the PR number range [min_num, max_num] (at most 2000 per run)."""
    started = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = started

    # Drive the single preprocessing coroutine to completion.
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(
        AsyncProjectAllDataFetcher.preProcessNoOriginLineReviewComment(
            loop, statistic, owner, repo, min_num, max_num))
    loop.run_until_complete(asyncio.gather(future))

    print('cost time:', datetime.now() - started)
def analyzePullRequestReview(pr_timeline_item_groups, pr_author_map):
    """Parse the change_trigger information of multiple PRs in one pass.

    Parameters:
        pr_timeline_item_groups: iterable of per-PR timeline item groups,
            each analyzed by one coroutine.
        pr_author_map: mapping used by the analyzer to resolve PR authors.

    Returns:
        The aggregated results of all analyzePRTimeline coroutines
        (the gathered future's result list).
    """
    t1 = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = t1

    # Obtain the shared MySQL connection object on the event loop first.
    loop = asyncio.get_event_loop()
    task = loop.create_task(getMysqlObj(loop))
    loop.run_until_complete(task)
    mysql = task.result()

    # CLEANUP: the original re-fetched the event loop here; it is the same
    # loop, so reuse the one obtained above.
    tasks = [asyncio.ensure_future(
                AsyncProjectAllDataFetcher.analyzePRTimeline(
                    mysql, pr_timeline_item_group, statistic, pr_author_map))
             for pr_timeline_item_group in pr_timeline_item_groups]
    tasks = asyncio.gather(*tasks)
    loop.run_until_complete(tasks)

    print('cost time:', datetime.now() - t1)
    return tasks.result()
def getPullRequestTimeLine(owner, repo, nodes):
    """Fetch the timeline information of multiple pull requests and
    concatenate the comments found on them.

    Parameters:
        owner, repo: target repository coordinates.
        nodes: list of PR node identifiers; batched in groups of 10 so
            nested GraphQL nodes can save request count.

    Returns:
        The aggregated results of the downloadRPTimeLine coroutines.
    """
    AsyncApiHelper.setRepo(owner, repo)
    t1 = datetime.now()
    statistic = statisticsHelper()
    statistic.startTime = t1
    # Throttle concurrency with a semaphore sized from configuration.
    semaphore = asyncio.Semaphore(configPraser.getSemaphore())

    # Obtain the shared MySQL connection object on the event loop.
    loop = asyncio.get_event_loop()
    task = loop.create_task(getMysqlObj(loop))
    loop.run_until_complete(task)
    mysql = task.result()

    # Pad nodes with None up to a multiple of 10, then split into 10 groups.
    if len(nodes) % 10 == 0:
        nodesGroup = np.array(nodes).reshape(10, -1)
    else:
        offset = 10 - len(nodes) % 10
        nodes.extend([None for _ in range(offset)])
        # BUG FIX: the original used reshape(20, -1) here, which raises
        # ValueError whenever the padded length is not divisible by 20;
        # the padding above makes the length a multiple of 10, so the
        # correct shape is (10, -1), matching the even branch.
        nodesGroup = np.array(nodes).reshape(10, -1)

    # One download task per group; None padding is filtered back out.
    tasks = [asyncio.ensure_future(
                AsyncApiHelper.downloadRPTimeLine(
                    [x for x in nodegroup.tolist() if x is not None],
                    semaphore, mysql, statistic))
             for nodegroup in nodesGroup]
    tasks = asyncio.gather(*tasks)
    loop.run_until_complete(tasks)

    print('cost time:', datetime.now() - t1)
    return tasks.result()
def getAllDataForProject(owner, repo):
    """Extract all data for a project: repository info plus owner info."""
    helper = ApiHelper(owner=owner, repo=repo)
    helper.setAuthorization(True)
    helper.setUseProxyPool(configPraser.getProxy())

    statistic = statisticsHelper()
    statistic.startTime = datetime.now()

    # Extract the project's information and the project owner's information.
    ProjectAllDataFetcher.getDataForRepository(helper)

    # NOTE(review): the concurrent pull-request extraction step
    # (getPullRequestForRepositoryUseConcurrent) is currently disabled here.
    statistic.endTime = datetime.now()

    print("useful pull request:", statistic.usefulRequestNumber,
          " useful review:", statistic.usefulReviewNumber,
          " useful review comment:", statistic.usefulReviewCommentNumber,
          " useful issue comment:", statistic.usefulIssueCommentNumber,
          " useful commit:", statistic.usefulCommitNumber,
          " cost time:", (statistic.endTime - statistic.startTime).seconds)