def _getPage(self, kwargs, prs):
    qdata = """{
      repository(owner: "%(org)s", name: "%(repository)s") {
        pullRequests(
          first: %(size)s
          %(after)s
          orderBy: { field: UPDATED_AT, direction: DESC }
        ) {
          totalCount
          pageInfo {
            hasNextPage
            endCursor
          }
          edges {
            node {
              %(pr_query)s
            }
          }
        }
      }
    }"""  # noqa: E501
    data = self.gql.query(qdata % kwargs)
    if "data" not in data:
        self.log.error("No data collected: %s" % data)
        if "message" in data and "wait a few minutes" in data["message"]:
            self.log.info("sleeping 2 minutes")
            sleep(120)
        else:
            self.log.info("sleeping 20 seconds")
            sleep(20)
        return None
    if not kwargs["total_prs_count"]:
        kwargs["total_prs_count"] = data["data"]["repository"]["pullRequests"][
            "totalCount"
        ]
        self.log.info(
            "Total PRs: %s but will fetch until we reach a PR "
            "updated at date < %s"
            % (kwargs["total_prs_count"], kwargs["updated_since"])
        )
        if kwargs["total_prs_count"] == 0:
            return False
    edges = data["data"]["repository"]["pullRequests"]["edges"]
    for pr in edges:
        prs.append(pr["node"])
    # Compute the oldest updatedAt ourselves to mitigate this API
    # ordering issue:
    # https://github.community/t5/GitHub-API-Development-and/apiv4-pullrequests-listing-broken-ordering/m-p/59439#M4968
    oldest_update = min(is8601_to_dt(pr["node"]["updatedAt"]) for pr in edges)
    logging.info("page oldest updated at date is %s" % oldest_update)
    if oldest_update < kwargs["updated_since"]:
        # The crawler reached a page where the oldest updated PR
        # is older than the configured limit
        return False
    pageInfo = data["data"]["repository"]["pullRequests"]["pageInfo"]
    if pageInfo["hasNextPage"]:
        kwargs["after"] = 'after: "%s"' % pageInfo["endCursor"]
        return True
    else:
        return False
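# Illustrative sketch (not part of the crawler): the cursor-pagination loop
# that _getPage implements, reduced to its essentials. "fetch_page" and its
# return shape are hypothetical stand-ins for the GraphQL call; only the
# pageInfo.hasNextPage / pageInfo.endCursor handling mirrors the real
# GitHub API contract.
def paginate_all(fetch_page):
    """Accumulate nodes by following GraphQL pageInfo cursors."""
    nodes = []
    after = None
    while True:
        # e.g. {"nodes": [...], "pageInfo": {"hasNextPage": ..., "endCursor": ...}}
        page = fetch_page(after)
        nodes.extend(page["nodes"])
        if not page["pageInfo"]["hasNextPage"]:
            return nodes
        after = page["pageInfo"]["endCursor"]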
def get_rate_limit(self):
    ratelimit = self.getRateLimit()
    if ratelimit:
        self.quota_remain = ratelimit["remaining"]
        self.resetat = utils.is8601_to_dt(ratelimit["resetAt"])
        self.log.info(
            "Got rate limit data: remain %s resetat %s"
            % (self.quota_remain, self.resetat)
        )
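# For reference, a minimal sketch of the query getRateLimit is assumed to
# issue. The rateLimit object with "remaining" and "resetAt" fields is part
# of the public GitHub GraphQL schema, but the crawler's actual query may
# differ.
RATE_LIMIT_QUERY = """{
  rateLimit {
    remaining
    resetAt
  }
}"""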
def _first_event_on_changes(es, index, repository_fullname, params):
    params = deepcopy(params)

    def keyfunc(x):
        return x["change_id"]

    groups = {}
    _events = _scan(es, index, repository_fullname, params)
    # groupby only merges adjacent items, so sort by the grouping key first
    _events = sorted(_events, key=lambda k: k["change_id"])
    # Keep, per Change, the created date + the first event date
    for pr, events in groupby(_events, keyfunc):
        groups[pr] = {
            "change_created_at": None,
            "first_event_created_at": utcnow(),
            "first_event_author": None,
            "delta": None,
        }
        for event in events:
            if not groups[pr]["change_created_at"]:
                groups[pr]["change_created_at"] = is8601_to_dt(
                    event["on_created_at"]
                )
            event_created_at = is8601_to_dt(event["created_at"])
            if event_created_at < groups[pr]["first_event_created_at"]:
                groups[pr]["first_event_created_at"] = event_created_at
                groups[pr]["delta"] = (
                    groups[pr]["first_event_created_at"]
                    - groups[pr]["change_created_at"]
                )
                groups[pr]["first_event_author"] = event["author"]["muid"]
    ret = {"first_event_delay_avg": 0, "top_authors": {}}
    for pr_data in groups.values():
        # total_seconds() instead of .seconds: the latter only returns the
        # seconds component and silently drops full days for long delays
        ret["first_event_delay_avg"] += int(pr_data["delta"].total_seconds())
        ret["top_authors"].setdefault(pr_data["first_event_author"], 0)
        ret["top_authors"][pr_data["first_event_author"]] += 1
    try:
        ret["first_event_delay_avg"] = int(
            ret["first_event_delay_avg"] / len(groups)
        )
    except ZeroDivisionError:
        ret["first_event_delay_avg"] = 0
    ret["top_authors"] = sorted(
        ret["top_authors"].items(),
        key=lambda x: x[1],
        reverse=True,
    )[:10]
    return ret
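# A minimal illustration (toy data, not crawler code) of why _events is
# sorted before itertools.groupby: groupby only merges *adjacent* items with
# equal keys, so unsorted input would split one change into several groups.
from itertools import groupby

_toy = [{"change_id": "b"}, {"change_id": "a"}, {"change_id": "b"}]
assert [k for k, _ in groupby(_toy, lambda x: x["change_id"])] == ["b", "a", "b"]
_toy.sort(key=lambda x: x["change_id"])
assert [k for k, _ in groupby(_toy, lambda x: x["change_id"])] == ["a", "b"]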
def get(
    self, updated_since: str, change_id: Optional[str] = None
) -> List[RawChange]:
    prs: List[RawChange] = []
    updated_since = is8601_to_dt(updated_since)
    get_commits = True
    kwargs = {
        "pr_query": self.get_pr_query(include_commits=get_commits),
        "org": self.org,
        "repository": self.repository,
        "updated_since": updated_since,
        "after": "",
        "total_prs_count": 0,
        "size": self.size,
    }
    one = 0
    while True:
        self.log.info(
            "Running request %s"
            % {k: v for k, v in kwargs.items() if k != "pr_query"}
        )
        try:
            hnp = self._getPage(kwargs, prs)
            if kwargs["size"] == 1:
                self.log.debug("Getting this PR, with page size 1: %s" % prs[0])
            kwargs["size"] = min(MAX_BULK_SIZE, int(kwargs["size"] * AUGMENT) + 1)
            one = 0
            if not get_commits:
                self.log.info("Will get full commits on next query.")
                # Re-enable commits before rebuilding the query, so the
                # next page actually includes them
                get_commits = True
                kwargs["pr_query"] = self.get_pr_query(include_commits=get_commits)
        except RequestTimeout:
            kwargs["size"] = max(1, kwargs["size"] // REDUCE)
            if kwargs["size"] == 1:
                one += 1
                if one == MAX_TRY - 1:
                    self.log.info(
                        "%d timeouts in a row for one PR, retrying without commits."
                        % (MAX_TRY - 1)
                    )
                    get_commits = False
                    kwargs["pr_query"] = self.get_pr_query(
                        include_commits=get_commits
                    )
                elif one >= MAX_TRY:
                    self.log.info(
                        "%d timeouts in a row for one PR, giving up." % MAX_TRY
                    )
                    raise
            continue
        self.log.info("%s PRs fetched" % len(prs))
        if hnp is False:
            break
    return prs
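# Illustrative sketch of the adaptive page-size policy used in get() above:
# grow the page size multiplicatively after a success, shrink it after a
# timeout. The default values here are assumptions for the demo; the real
# MAX_BULK_SIZE / AUGMENT / REDUCE constants are defined elsewhere in the
# module.
def next_page_size(size, timed_out, max_bulk_size=100, augment=1.1, reduce_by=2):
    if timed_out:
        return max(1, size // reduce_by)
    return min(max_bulk_size, int(size * augment) + 1)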
def ensure_gte_lte(es, index, repository_fullname, params):
    if not params.get("gte"):
        first_created_event = _first_created_event(
            es, index, repository_fullname, params
        )
        if first_created_event:
            params["gte"] = int(
                is8601_to_dt(first_created_event).timestamp() * 1000
            )
        else:
            # There is probably nothing in the db that matches the query
            params["gte"] = None
    if not params.get("lte"):
        params["lte"] = int(utcnow().timestamp() * 1000)
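# Sanity check of the unit convention above: "gte"/"lte" hold epoch
# *milliseconds*, hence the * 1000 on datetime.timestamp(), which returns
# float seconds. Toy example:
from datetime import datetime, timezone

assert int(datetime(2020, 1, 1, tzinfo=timezone.utc).timestamp() * 1000) == 1577836800000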
def get(
    self, updated_since: str, change_id: Optional[str] = None
) -> List[RawChange]:
    if not change_id:
        request_params = "?q=after:%s+project:%s" % (
            utils.is8601_to_dt(updated_since).strftime("%Y-%m-%d"),
            self.repository_prefix,
        )
    else:
        request_params = "?q=change:%s" % change_id
    for option in [
        "MESSAGES",
        "DETAILED_ACCOUNTS",
        "DETAILED_LABELS",
        "CURRENT_REVISION",
        "CURRENT_FILES",
        "CURRENT_COMMIT",
    ]:
        request_params += "&o=%s" % option
    count = 100
    start_after = 0
    reviews = []
    while True:
        urlpath = (
            self.base_url
            + "/changes/"
            + request_params
            + "&n=%s&start=%s" % (count, start_after)
        )
        self.log.info("query: %s" % urlpath)
        try:
            response = requests.get(
                urlpath, verify=not self.insecure, auth=self.auth
            )
            response.raise_for_status()
        except Exception:
            self.log.exception("Unable to process the Gerrit query request")
            break
        # Strip the ")]}'" XSSI protection prefix Gerrit adds to JSON responses
        _reviews = json.loads(response.text[4:])
        if _reviews:
            reviews.extend(_reviews)
            self.log.info("read %s reviews from the api" % len(reviews))
            if reviews[-1].get("_more_changes"):
                start_after = len(reviews)
            else:
                break
        else:
            break
    if self.prefix:
        for review in reviews:
            review["project"] = self.prefix + review["project"]
    return reviews
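# Why response.text[4:] above: the Gerrit REST API prepends the magic
# prefix ")]}'" to every JSON response to defeat XSSI attacks, so the
# first four characters must be stripped before json.loads. A minimal
# illustration with a canned payload:
import json

_raw = ")]}'\n[{\"_number\": 42}]"
assert json.loads(_raw[4:]) == [{"_number": 42}]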
def cold_changes(es, index, repository_fullname, params):
    params = deepcopy(params)
    size = params.get("size")
    # First pass: all open changes
    params["etype"] = ("Change",)
    params["state"] = ("OPEN",)
    changes = _scan(es, index, repository_fullname, params)
    _changes_ids = {change["change_id"] for change in changes}
    # Second pass: all comment/review events, regardless of change state
    params["etype"] = ("ChangeCommentedEvent", "ChangeReviewedEvent")
    del params["state"]
    events = _scan(es, index, repository_fullname, params)
    _events_ids = {event["change_id"] for event in events}
    # Cold changes are the open ones that never received a comment or review
    changes_ids_wo_rc = _changes_ids.difference(_events_ids)
    changes_wo_rc = [
        change for change in changes if change["change_id"] in changes_ids_wo_rc
    ]
    changes_wo_rc = enhance_changes(changes_wo_rc)
    items = sorted(changes_wo_rc, key=lambda x: is8601_to_dt(x["created_at"]))
    if size:
        items = items[:size]
    return {"items": items}
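# The core of cold_changes is a plain set difference: open changes that
# never received a comment or review event. Toy illustration:
_open_ids = {"repo#1", "repo#2", "repo#3"}
_reviewed_ids = {"repo#2"}
assert _open_ids.difference(_reviewed_ids) == {"repo#1", "repo#3"}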
def timedelta(start, end):
    start = utils.is8601_to_dt(start)
    end = utils.is8601_to_dt(end)
    return int((start - end).total_seconds())
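# Example usage of timedelta above (argument order matters: it returns
# start - end in whole seconds, so an "end" later than "start" yields a
# negative value). Timestamps are illustrative:
#   timedelta("2020-01-01T01:00:00Z", "2020-01-01T00:00:00Z") == 3600
#   timedelta("2020-01-01T00:00:00Z", "2020-01-01T01:00:00Z") == -3600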