def _getPage(self, kwargs, prs):
    qdata = '''{
      repository(owner: "%(org)s", name: "%(repository)s") {
        pullRequests(
          first: %(size)s %(after)s
          orderBy: { field: UPDATED_AT, direction: DESC }
        ) {
          totalCount
          pageInfo {
            hasNextPage endCursor
          }
          edges {
            node {
              %(pr_query)s
            }
          }
        }
      }
    }'''  # noqa: E501
    data = self.gql.query(qdata % kwargs)
    if 'data' not in data:
        self.log.error('No data collected: %s' % data)
        if 'message' in data and 'wait a few minutes' in data['message']:
            self.log.info('sleeping 2 min')
            sleep(120)
        else:
            self.log.info('sleeping 20 s')
            sleep(20)
        return None
    if not kwargs['total_prs_count']:
        kwargs['total_prs_count'] = data['data']['repository'][
            'pullRequests']['totalCount']
        self.log.info(
            "Total PRs: %s but will fetch until we reach a PR "
            "updated at date < %s" % (
                kwargs['total_prs_count'], kwargs['updated_since'])
        )
        if kwargs['total_prs_count'] == 0:
            return False
    edges = data['data']['repository']['pullRequests']['edges']
    for pr in edges:
        prs.append(pr['node'])
    # We sort to mitigate this
    # https://github.community/t5/GitHub-API-Development-and/apiv4-pullrequests-listing-broken-ordering/m-p/59439#M4968
    oldest_update = sorted(
        [dbdate_to_datetime(pr['node']['updatedAt']) for pr in edges],
        reverse=True,
    )[-1]
    # Use the instance logger rather than the root logging module
    self.log.info("page oldest updated at date is %s" % oldest_update)
    if oldest_update < kwargs['updated_since']:
        # The crawler reached a page where the oldest updated PR
        # is older than the configured limit
        return False
    pageInfo = data['data']['repository']['pullRequests']['pageInfo']
    if pageInfo['hasNextPage']:
        kwargs['after'] = 'after: "%s"' % pageInfo['endCursor']
        return True
    else:
        return False
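The template is rendered with plain %-style dict substitution, so every placeholder must have a matching key in kwargs (extra keys are simply ignored). A minimal, self-contained sketch of the rendering step, using hypothetical values:

kwargs = {
    'org': 'example-org',            # hypothetical owner
    'repository': 'example-repo',    # hypothetical repository name
    'size': 25,                      # current page size
    'after': '',                     # or 'after: "<endCursor>"' on later pages
    'pr_query': 'number updatedAt',  # fields requested per PR node
}
template = 'pullRequests(first: %(size)s %(after)s) { %(pr_query)s }'
print(template % kwargs)
# -> pullRequests(first: 25 ) { number updatedAt }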
def _first_event_on_changes(es, index, repository_fullname, params):
    params = deepcopy(params)

    def keyfunc(x):
        return x['change_id']

    groups = {}
    _events = _scan(es, index, repository_fullname, params)
    _events = sorted(_events, key=lambda k: k['change_id'])
    # For each Change, keep the created date and the first event date
    for pr, events in groupby(_events, keyfunc):
        groups[pr] = {
            'change_created_at': None,
            # Sentinel: any real event predates "now"
            'first_event_created_at': datetime.now(),
            'first_event_author': None,
            'delta': None,
        }
        for event in events:
            if not groups[pr]['change_created_at']:
                groups[pr]['change_created_at'] = dbdate_to_datetime(
                    event['on_created_at'])
            event_created_at = dbdate_to_datetime(event['created_at'])
            if event_created_at < groups[pr]['first_event_created_at']:
                groups[pr]['first_event_created_at'] = event_created_at
                groups[pr]['delta'] = (
                    groups[pr]['first_event_created_at'] -
                    groups[pr]['change_created_at'])
                groups[pr]['first_event_author'] = event['author']
    ret = {'first_event_delay_avg': 0, 'top_authors': {}}
    for pr_data in groups.values():
        # total_seconds() rather than .seconds, so that deltas longer
        # than a day are counted fully
        ret['first_event_delay_avg'] += int(pr_data['delta'].total_seconds())
        ret['top_authors'].setdefault(pr_data['first_event_author'], 0)
        ret['top_authors'][pr_data['first_event_author']] += 1
    try:
        ret['first_event_delay_avg'] = int(
            ret['first_event_delay_avg'] / len(groups))
    except ZeroDivisionError:
        ret['first_event_delay_avg'] = 0
    ret['top_authors'] = sorted(
        ret['top_authors'].items(),
        key=lambda x: x[1],
        reverse=True,
    )[:10]
    return ret
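The functions above lean on the dbdate_to_datetime helper, which is not shown in this section. Since get() below normalizes bare dates to the 'YYYY-MM-DDTHH:MM:SSZ' form before converting, a minimal sketch of the helper could look like this (the real implementation may accept more formats):

from datetime import datetime

def dbdate_to_datetime(date_str):
    # Parse the ISO-8601 'Z'-suffixed form used throughout this code
    return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')

# dbdate_to_datetime('2020-01-01T00:00:00Z') -> datetime(2020, 1, 1, 0, 0)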
def get(self, updated_since):
    prs = []
    if len(updated_since.split('T')) == 1:
        updated_since += 'T00:00:00Z'
    updated_since = dbdate_to_datetime(updated_since)
    get_commits = True
    kwargs = {
        'pr_query': self.get_pr_query(include_commits=get_commits),
        'org': self.org,
        'repository': self.repository,
        'updated_since': updated_since,
        'after': '',
        'total_prs_count': 0,
        'size': self.size,
    }
    one = 0
    while True:
        self.log.info(
            'Running request %s' % {
                k: v for k, v in kwargs.items() if k != 'pr_query'}
        )
        try:
            hnp = self._getPage(kwargs, prs)
            if kwargs['size'] == 1:
                self.log.debug(
                    'Getting this PR, with page size 1: %s' % prs[0])
            kwargs['size'] = min(
                MAX_BULK_SIZE, int(kwargs['size'] * AUGMENT) + 1)
            one = 0
            if not get_commits:
                self.log.info('Will get full commits on next query.')
                # Re-enable commits before rebuilding the query, so the
                # next page actually includes them
                get_commits = True
                kwargs['pr_query'] = self.get_pr_query(
                    include_commits=get_commits)
        except RequestTimeout:
            kwargs['size'] = max(1, kwargs['size'] // REDUCE)
            if kwargs['size'] == 1:
                one += 1
                if one == MAX_TRY - 1:
                    self.log.info(
                        '%d timeouts in a row for one PR, '
                        'retrying without commits.' % (MAX_TRY - 1))
                    get_commits = False
                    kwargs['pr_query'] = self.get_pr_query(
                        include_commits=get_commits)
                elif one >= MAX_TRY:
                    self.log.info(
                        '%d timeouts in a row for one PR, giving up.'
                        % MAX_TRY)
                    raise
            continue
        self.log.info("%s PRs fetched" % len(prs))
        if hnp is False:
            break
    return prs
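MAX_BULK_SIZE, AUGMENT, REDUCE and MAX_TRY are tuning constants defined elsewhere; their real values are not shown in this section. With illustrative values, the adaptive page size evolves as follows:

# Hypothetical values for the constants defined elsewhere in the module.
MAX_BULK_SIZE = 100
AUGMENT = 1.5   # growth factor applied after a successful page
REDUCE = 2      # shrink divisor applied after a RequestTimeout

size = 25
size = min(MAX_BULK_SIZE, int(size * AUGMENT) + 1)  # success -> 38
size = max(1, size // REDUCE)                       # timeout -> 19
size = max(1, size // REDUCE)                       # timeout -> 9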
def run_named_query(self, name, *args, **kwargs):
    # Here we set gte and lte if not provided by the user,
    # especially to be able to set the histogram extended_bounds
    if not args[1].get('gte'):
        args[1]['gte'] = int(
            utils.dbdate_to_datetime(
                queries._first_created_event(
                    self.es, self.index, *args, **kwargs)
            ).timestamp() * 1000)
    if not args[1].get('lte'):
        args[1]['lte'] = int(datetime.now().timestamp() * 1000)
    return getattr(queries, name)(self.es, self.index, *args, **kwargs)
def run_named_query(self, name, *args, **kwargs):
    # Here we set gte and lte if not provided by the user,
    # especially to be able to set the histogram extended_bounds
    if name not in queries.public_queries:
        raise UnknownQueryException("Unknown query: %s" % name)
    if not args[1].get('gte'):
        first_created_event = queries._first_created_event(
            self.es, self.index, *args, **kwargs)
        if first_created_event:
            args[1]['gte'] = int(
                utils.dbdate_to_datetime(
                    first_created_event).timestamp() * 1000)
        else:
            # There is probably nothing in the db that matches the query
            args[1]['gte'] = None
    if not args[1].get('lte'):
        args[1]['lte'] = int(datetime.now().timestamp() * 1000)
    return getattr(queries, name)(self.es, self.index, *args, **kwargs)
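In both variants args[1] is the query parameters dict, so callers pass the repository name first and the params dict second. A hypothetical call, where db is an instance exposing run_named_query and 'changes_events_counters' stands in for a name listed in queries.public_queries:

params = {'etype': ('Change',)}  # no gte/lte: the bounds get filled in
result = db.run_named_query(
    'changes_events_counters', 'example-org/example-repo', params)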
def cold_changes(es, index, repository_fullname, params):
    params['etype'] = ('Change',)
    params['state'] = 'OPEN'
    changes = _scan(es, index, repository_fullname, params)
    _changes_ids = set(
        [change['repository_fullname_and_number'] for change in changes])
    params['etype'] = ('ChangeCommentedEvent', 'ChangeReviewedEvent')
    del params['state']
    events = _scan(es, index, repository_fullname, params)
    _events_ids = set(
        [event['repository_fullname_and_number'] for event in events])
    changes_ids_wo_rc = _changes_ids.difference(_events_ids)
    changes_wo_rc = [
        change for change in changes
        if change['repository_fullname_and_number'] in changes_ids_wo_rc
    ]
    return sorted(
        changes_wo_rc, key=lambda x: dbdate_to_datetime(x['created_at']))
def cold_changes(es, index, repository_fullname, params):
    params = deepcopy(params)
    size = params.get('size')
    params['etype'] = ('Change',)
    params['state'] = 'OPEN'
    changes = _scan(es, index, repository_fullname, params)
    _changes_ids = set([change['change_id'] for change in changes])
    params['etype'] = ('ChangeCommentedEvent', 'ChangeReviewedEvent')
    del params['state']
    events = _scan(es, index, repository_fullname, params)
    _events_ids = set([event['change_id'] for event in events])
    changes_ids_wo_rc = _changes_ids.difference(_events_ids)
    changes_wo_rc = [
        change for change in changes
        if change['change_id'] in changes_ids_wo_rc
    ]
    changes_wo_rc = enhance_changes(changes_wo_rc)
    items = sorted(
        changes_wo_rc, key=lambda x: dbdate_to_datetime(x['created_at']))
    if size:
        items = items[:size]
    return {'items': items}
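Stripped of the Elasticsearch plumbing, the cold-change detection in both variants is a plain set difference between change ids and the ids that received a comment or review event. A toy illustration with hypothetical ids:

changes = [{'change_id': 'c1'}, {'change_id': 'c2'}, {'change_id': 'c3'}]
events = [{'change_id': 'c2'}]  # only c2 got a comment or a review

change_ids = {c['change_id'] for c in changes}
reviewed_ids = {e['change_id'] for e in events}
cold_ids = change_ids - reviewed_ids            # {'c1', 'c3'}
cold = [c for c in changes if c['change_id'] in cold_ids]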