Ejemplo n.º 1
0
 def _getPage(self, kwargs, prs):
     """Fetch one page of pull requests and append its nodes to ``prs``.

     ``kwargs`` provides the values interpolated into the GraphQL query and
     is updated in place: ``total_prs_count`` is filled from the first
     successful response, and ``after`` is set to the next cursor when the
     response reports another page.

     Returns:
         ``True`` when a next page is available and should be fetched,
         ``False`` when crawling must stop (no PR at all, the page contains
         a PR updated before ``kwargs['updated_since']``, or there is no
         next page), ``None`` when the response carries no ``data`` key
         (the method sleeps before returning in that case).
     """
     qdata = '''{
       repository(owner: "%(org)s", name:"%(repository)s") {
         pullRequests(
           first: %(size)s
           %(after)s
           orderBy: { field: UPDATED_AT, direction: DESC }
         ) {
           totalCount
           pageInfo {
             hasNextPage endCursor
           }
           edges {
             node {
               %(pr_query)s
             }
           }
         }
       }
     }'''  # noqa: E501
     data = self.gql.query(qdata % kwargs)
     if 'data' not in data:
         self.log.error('No data collected: %s' % data)
         # Back off longer when the API explicitly asks to wait.
         if 'message' in data and 'wait a few minutes' in data['message']:
             self.log.info('sleeping 2 mn')
             sleep(120)
         else:
             self.log.info('sleeping 20 s')
             sleep(20)
         return None
     pull_requests = data['data']['repository']['pullRequests']
     if not kwargs['total_prs_count']:
         kwargs['total_prs_count'] = pull_requests['totalCount']
         self.log.info(
             "Total PRs: %s but will fetch until we reached a PR "
             "updated at date < %s"
             % (kwargs['total_prs_count'], kwargs['updated_since'])
         )
         if kwargs['total_prs_count'] == 0:
             return False
     edges = pull_requests['edges']
     prs.extend(pr['node'] for pr in edges)
     # The API ordering is not reliable, so look at every PR of the page
     # instead of trusting the last one to be the oldest:
     # https://github.community/t5/GitHub-API-Development-and/apiv4-pullrequests-listing-broken-ordering/m-p/59439#M4968
     # An empty page has no date to compare; rely on pageInfo in that case.
     if edges:
         oldest_update = min(
             dbdate_to_datetime(pr['node']['updatedAt']) for pr in edges
         )
         self.log.info("page oldest updated at date is %s" % oldest_update)
         if oldest_update < kwargs['updated_since']:
             # The crawler reached a page where the oldest updated PR
             # is older than the configured limit
             return False
     pageInfo = pull_requests['pageInfo']
     if pageInfo['hasNextPage']:
         kwargs['after'] = 'after: "%s"' % pageInfo['endCursor']
         return True
     return False
Ejemplo n.º 2
0
def _first_event_on_changes(es, index, repository_fullname, params):
    """Compute first-event statistics for the changes matching ``params``.

    Events returned by ``_scan`` are grouped by ``change_id``. For each
    change, the earliest event (by ``created_at``) is kept along with its
    author and its delay from the change creation date, which is read from
    ``on_created_at`` of the first event seen for that change.

    Returns a dict with:
      - ``first_event_delay_avg``: average delay, in whole seconds, between
        the creation of a change and its first event (0 without changes);
      - ``top_authors``: up to 10 ``(author, count)`` tuples counting how
        often each author produced the first event, most frequent first.
    """
    params = deepcopy(params)

    def keyfunc(x):
        return x['change_id']

    # groupby only merges adjacent items, hence the sort on the same key.
    _events = sorted(
        _scan(es, index, repository_fullname, params), key=keyfunc)
    # Keep by Change the created date + first event date
    groups = {}
    for pr, events in groupby(_events, keyfunc):
        change_created_at = None
        first_event_created_at = None
        first_event_author = None
        for event in events:
            if change_created_at is None:
                change_created_at = dbdate_to_datetime(
                    event['on_created_at'])
            event_created_at = dbdate_to_datetime(event['created_at'])
            # Start from "no event yet" rather than from the current time,
            # so a first event is always recorded for the change.
            if (first_event_created_at is None
                    or event_created_at < first_event_created_at):
                first_event_created_at = event_created_at
                first_event_author = event['author']
        groups[pr] = {
            'change_created_at': change_created_at,
            'first_event_created_at': first_event_created_at,
            'first_event_author': first_event_author,
            'delta': first_event_created_at - change_created_at,
        }

    total_delay = 0
    author_counts = {}
    for pr_data in groups.values():
        # total_seconds() includes the days part of the timedelta, whereas
        # the .seconds attribute silently drops it.
        total_delay += pr_data['delta'].total_seconds()
        author = pr_data['first_event_author']
        author_counts[author] = author_counts.get(author, 0) + 1

    return {
        'first_event_delay_avg': (
            int(total_delay / len(groups)) if groups else 0),
        'top_authors': sorted(
            author_counts.items(),
            key=lambda x: x[1],
            reverse=True,
        )[:10],
    }
Ejemplo n.º 3
0
 def get(self, updated_since):
     """Fetch the pull requests, page by page, down to ``updated_since``.

     ``updated_since`` is a date string; when it has no time part
     (no ``T``), midnight UTC is appended before it is parsed.

     The page size grows after each completed request and shrinks after a
     ``RequestTimeout``. When requests keep timing out at page size 1, one
     attempt is made without commits in the PR query; if that also times
     out the ``RequestTimeout`` is re-raised.

     Returns:
         The list of PR nodes accumulated by ``_getPage``.
     """
     prs = []
     if 'T' not in updated_since:
         updated_since += 'T00:00:00Z'
     updated_since = dbdate_to_datetime(updated_since)
     get_commits = True
     kwargs = {
         'pr_query': self.get_pr_query(include_commits=get_commits),
         'org': self.org,
         'repository': self.repository,
         'updated_since': updated_since,
         'after': '',
         'total_prs_count': 0,
         'size': self.size,
     }
     # Number of consecutive timeouts while already at page size 1.
     one = 0
     while True:
         self.log.info(
             'Running request %s'
             % {k: v for k, v in kwargs.items() if k != 'pr_query'}
         )
         try:
             hnp = self._getPage(kwargs, prs)
             if kwargs['size'] == 1 and prs:
                 # With page size 1 the PR just fetched is the last one
                 # appended; prs may still be empty if nothing was fetched.
                 self.log.debug('Getting this PR, with page size 1: %s' % prs[-1])
             kwargs['size'] = min(MAX_BULK_SIZE, int(kwargs['size'] * AUGMENT) + 1)
             one = 0
             if not get_commits:
                 self.log.info('Will get full commits on next query.')
                 # Flip the flag first so the rebuilt query really
                 # includes commits again.
                 get_commits = True
                 kwargs['pr_query'] = self.get_pr_query(include_commits=get_commits)
         except RequestTimeout:
             kwargs['size'] = max(1, kwargs['size'] // REDUCE)
             if kwargs['size'] == 1:
                 one += 1
                 if one == MAX_TRY - 1:
                     self.log.info(
                         '%d timeouts in a raw for one pr, retrying without commits.'
                         % (MAX_TRY - 1)
                     )
                     get_commits = False
                     kwargs['pr_query'] = self.get_pr_query(
                         include_commits=get_commits
                     )
                 elif one >= MAX_TRY:
                     self.log.info(
                         '%d timeouts in a raw for one pr, giving up.' % MAX_TRY
                     )
                     raise
             continue
         self.log.info("%s PRs fetched" % len(prs))
         # Only an explicit False stops the loop; None (no data in the
         # response) means the same page is requested again.
         if hnp is False:
             break
     return prs
Ejemplo n.º 4
0
 def run_named_query(self, name, *args, **kwargs):
     """Run the function called ``name`` from the ``queries`` module.

     The second positional argument (``args[1]``, a dict-like params
     object) is updated in place before the call: a missing ``gte`` is set
     from ``queries._first_created_event`` and a missing ``lte`` is set to
     the current time, both as millisecond timestamps.

     Returns whatever the selected query function returns.
     """
     # NOTE(review): ``name`` is passed straight to getattr; confirm that
     # callers restrict it to the intended set of query functions.
     params = args[1]
     # Here we set gte and lte if not provided by user
     # especially to be able to set the histogram extended_bounds
     if not params.get('gte'):
         first_created_event = queries._first_created_event(
             self.es, self.index, *args, **kwargs)
         if first_created_event:
             params['gte'] = int(
                 utils.dbdate_to_datetime(first_created_event).timestamp() *
                 1000)
         else:
             # No first event was found: there is no date to convert.
             params['gte'] = None
     if not params.get('lte'):
         params['lte'] = int(datetime.now().timestamp() * 1000)
     return getattr(queries, name)(self.es, self.index, *args, **kwargs)
Ejemplo n.º 5
0
Archivo: db.py Proyecto: mhuin/monocle
 def run_named_query(self, name, *args, **kwargs):
     """Dispatch to the public query function called ``name``.

     Raises ``UnknownQueryException`` when ``name`` is not listed in
     ``queries.public_queries``. The second positional argument is updated
     in place so that ``gte`` and ``lte`` are always defined (this is needed
     to set the histogram extended_bounds): ``gte`` falls back to the first
     created event, or ``None`` if there is none, and ``lte`` to now.
     """
     if name not in queries.public_queries:
         raise UnknownQueryException("Unknown query: %s" % name)

     bounds = args[1]
     if not bounds.get('gte'):
         oldest = queries._first_created_event(
             self.es, self.index, *args, **kwargs)
         # A falsy result means that probably nothing in the db matches
         # the query; no lower bound can be derived then.
         bounds['gte'] = (
             int(utils.dbdate_to_datetime(oldest).timestamp() * 1000)
             if oldest else None)
     if not bounds.get('lte'):
         now_ms = datetime.now().timestamp() * 1000
         bounds['lte'] = int(now_ms)

     query = getattr(queries, name)
     return query(self.es, self.index, *args, **kwargs)
Ejemplo n.º 6
0
def cold_changes(es, index, repository_fullname, params):
    """Return the open changes that have no comment and no review event.

    Two scans are run with ``params``: one for ``Change`` items in state
    ``OPEN`` and one for ``ChangeCommentedEvent``/``ChangeReviewedEvent``
    items. Changes whose ``repository_fullname_and_number`` appears in no
    such event are returned, sorted by ``created_at`` (oldest first).
    """
    # Work on a copy so the caller's params are left untouched; a shallow
    # copy is enough because only top-level keys are set or removed here.
    params = dict(params)
    params['etype'] = ('Change', )
    params['state'] = 'OPEN'
    changes = _scan(es, index, repository_fullname, params)
    _changes_ids = {
        change['repository_fullname_and_number'] for change in changes}
    params['etype'] = ('ChangeCommentedEvent', 'ChangeReviewedEvent')
    del params['state']
    events = _scan(es, index, repository_fullname, params)
    _events_ids = {
        event['repository_fullname_and_number'] for event in events}
    changes_ids_wo_rc = _changes_ids.difference(_events_ids)
    changes_wo_rc = [
        change for change in changes
        if change['repository_fullname_and_number'] in changes_ids_wo_rc
    ]
    return sorted(changes_wo_rc,
                  key=lambda x: dbdate_to_datetime(x['created_at']))
Ejemplo n.º 7
0
def cold_changes(es, index, repository_fullname, params):
    """List open changes that received neither a comment nor a review.

    ``params`` is deep-copied before being adjusted for the two scans. The
    result is ``{'items': [...]}`` where the items are the enhanced changes
    sorted by ``created_at`` and, when ``params['size']`` is set, cut to
    that many entries.
    """
    query = deepcopy(params)
    limit = query.get('size')

    # First scan: every change that is still open.
    query['etype'] = ('Change', )
    query['state'] = 'OPEN'
    open_changes = _scan(es, index, repository_fullname, query)
    open_ids = {item['change_id'] for item in open_changes}

    # Second scan: comment/review events, whatever the change state.
    query['etype'] = ('ChangeCommentedEvent', 'ChangeReviewedEvent')
    del query['state']
    feedback = _scan(es, index, repository_fullname, query)
    touched_ids = {item['change_id'] for item in feedback}

    cold_ids = open_ids - touched_ids
    cold = enhance_changes(
        [item for item in open_changes if item['change_id'] in cold_ids])

    def created(change):
        return dbdate_to_datetime(change['created_at'])

    ordered = sorted(cold, key=created)
    return {'items': ordered[:limit] if limit else ordered}