def _get(cls, query):
    """Fetch the cached ``API`` row stored under *query*.

    Returns whatever ``.scalar()`` yields for rows whose ``API.query`` equals
    *query*. Raises ``NotExist`` when that result is ``None``.
    """
    # Newlines are stripped only for the log message; the lookup itself
    # uses the query text untouched.
    flattened = query.replace('\n', '')
    cached = Database.session.query(API).filter(API.query == query).scalar()
    if cached is not None:
        return cached
    log('cache not exist', flattened)
    raise NotExist
def all(cls, login, repositories):
    """Yield a validated ``Contribution`` for each repository that passes.

    A ``Contribution(login, repository)`` is built and validated for every
    item of *repositories*; each one is logged, and only those whose
    ``valid`` flag is truthy are yielded.
    """
    for repo in repositories:
        contribution = Contribution(login, repo)
        contribution.validate()
        summary = 'contribution all <{}> <{}> <{}> <{}> <{}> <{}>'.format(
            login,
            repo.name_with_owner,
            contribution.valid,
            contribution.star,
            contribution.commit_parts,
            contribution.star_pats,
        )
        log(summary)
        if not contribution.valid:
            continue
        yield contribution
def _set(cls, query, response):
    """Store *response* under *query*, stamped with the current unix time.

    The row is merged into the database session and committed immediately.
    """
    log('set result for query', query)
    record = API(query=query, response=response, unixtime=int(time.time()))
    session = Database.session
    session.merge(record)
    session.commit()
def main():
    """Run the whole pipeline: init the database, collect, log, render.

    The wall-clock duration of the run is logged at the end.
    """
    started = time.time()
    init_db()
    users = all_data()
    log_data(users)
    generate_html(users)
    elapsed = time.time() - started
    log('total time cost {} seconds'.format(elapsed))
def generate_html(users):
    """Render the rank template and write it to ``index.html``.

    Only the first 1000 entries of *users* are passed to the template. The
    "updated" stamp is the current time in a fixed UTC+8 timezone. The file
    is written under ``config.static`` as UTF-8.
    """
    utc_plus_8 = datetime.timezone(datetime.timedelta(hours=8))
    stamp = str(datetime.datetime.now(utc_plus_8))
    page = Template.render(
        'template_rank.html',
        updated=stamp,
        users=users[:1000],
    )
    target = os.path.join(config.static, 'index.html')
    with open(target, 'w', encoding='utf-8') as out:
        out.write(page)
    log('finish generate html, length {}'.format(len(page)))
def get_crawler(cls, query):
    """Return the response for *query*, using the cache when it is valid.

    A cached row is used only if ``cls._valid_cache`` accepts it; a missing
    or rejected row falls back to ``cls._get_crawler(query)``.
    """
    log('get_crawler', query)
    try:
        cached = cls._get(query)
    except NotExist:
        # Nothing cached yet for this query.
        return cls._get_crawler(query)
    if cls._valid_cache(cached):
        return cached.response
    return cls._get_crawler(query)
def validate(self):
    """Set ``self.valid`` and record the repository when it is rejected.

    Checks run in order and stop at the first failure:
    language / star count, then name and description, then code files.
    A rejected repository gets a tuple describing it appended to
    ``self.all_invalid``. The outcome is always logged.
    """
    rejection = None
    # The language can be None for some repositories (no files, or other
    # reasons), so that case is rejected together with blocked languages.
    if (self.language is None
            or self.language in config.invalid_language
            or self.total_star == 0):
        rejection = (self.name_with_owner, self.total_star, self.language)
    elif not self.valid_name_and_description():
        rejection = (self.name_with_owner, self.total_star,
                     self.name_with_owner, self.description)
    elif not self.valid_code_files():
        rejection = (self.name_with_owner, self.total_star, self.files)

    if rejection is None:
        self.valid = True
    else:
        self.valid = False
        self.all_invalid.append(rejection)

    log('repository.validate <{}> <{}> <{}> <{}>'.format(
        self.name_with_owner, self.total_star, self.valid, self.files))
def get_v3(cls, query):
    """Return the decoded JSON result for a v3 *query*, preferring the cache.

    - No cached row: fetch with ``cls._get_v3``; if that raises
      ``ErrorCode202``, wait five seconds and start over.
    - Cached row accepted by ``cls._valid_cache``: decode and return it.
    - Cached row rejected: fetch fresh; if that raises ``ErrorCode202``,
      fall back to the decoded cached response.
    """
    log('get_v3', query)
    try:
        cached = cls._get(query)
    except NotExist:
        try:
            return cls._get_v3(query)
        except ErrorCode202:
            # Nothing cached to fall back on: pause, then retry from the top.
            time.sleep(5)
            return cls.get_v3(query)

    if cls._valid_cache(cached):
        return json.loads(cached.response)

    try:
        return cls._get_v3(query)
    except ErrorCode202:
        # The cached copy failed validation but is still the best answer.
        return json.loads(cached.response)
def get_v4_connection(cls, query, keyword, parameter, format_mapping):
    """Generate the ``edges`` list of each page of a v4 connection.

    The first page is always yielded. Each later page is fetched only while
    the value sent into the generator at the previous ``yield`` is truthy;
    plain iteration sends ``None``, which stops after the second page. The
    generator also stops once the current page has no ``endCursor`` and
    ``hasNextPage`` is falsy.

    *parameter* is mutated in place: its ``'after'`` key is set to the
    cursor of the page being requested.
    """
    log('get_v4_connection', query, parameter)

    def fetch_connection():
        # Builds the query from the current state of `parameter`.
        built = cls._query_for_connection(query, parameter, format_mapping)
        result = cls._get_v4_cache(built)
        return cls._connection_for_keyword(result['data'], keyword)

    connection = fetch_connection()
    yield connection['edges']

    keep_going = True
    while keep_going:
        cursor = connection['pageInfo']['endCursor']
        more_pages = connection['pageInfo']['hasNextPage']
        if cursor is None and not more_pages:
            return
        parameter['after'] = cursor
        connection = fetch_connection()
        # The consumer decides whether to go on by what it sends back.
        keep_going = yield connection['edges']
def all(cls):
    """Yield each distinct, non-blocked user that has at least one star.

    Users from ``users_for_extra`` come before those from
    ``users_for_query``; a login is processed only the first time it is
    seen. For every processed user the contributions are sorted by star
    count (descending), ``star`` is their sum, and ``language`` is a list
    of ``(language, stars)`` pairs sorted by stars (descending).
    """
    extra_users = cls.users_for_extra()
    queried_users = cls.users_for_query()
    candidates = list(extra_users) + list(queried_users)
    seen_logins = set()
    for index, user in enumerate(candidates):
        if user.login in seen_logins or user.login in config.block_user:
            continue
        seen_logins.add(user.login)
        log('start user no.{} {} {}'.format(
            index, user.login, len(user.repositories)))

        contributions = Contribution.all(user.login, user.repositories)
        user.contribution = sorted(
            contributions, key=lambda item: item.star, reverse=True)
        user.star = sum(item.star for item in user.contribution)

        # NOTE(review): the yield and the closing log are kept together
        # under the star check, so `user.language` is always set when it is
        # logged - confirm against the intended nesting.
        if user.star > 0:
            stars_by_language = {}
            for item in user.contribution:
                language = item.repository.language
                stars_by_language[language] = (
                    stars_by_language.get(language, 0) + item.star)
            user.language = sorted(
                stars_by_language.items(),
                key=lambda pair: pair[1],
                reverse=True,
            )
            yield user
            log('end user no.{} {} {}'.format(
                index, user.login, user.language))
def _get_v4(cls, query, cache=True):
    """POST a GraphQL *query* to the GitHub v4 endpoint and return the JSON.

    The query is wrapped so that every request also asks for ``rateLimit``.

    Args:
        query: GraphQL selection text placed inside the outer braces.
        cache: when true, the raw response text is stored via ``cls._set``.

    Returns:
        The decoded JSON response.

    Raises:
        ErrorCode: the HTTP status is not 200.
        GraphQLError: the response carries errors and none of them is a
            rate-limit error.

    When any error is ``RATE_LIMITED`` the method looks up the reset time,
    sleeps until then (plus a 3 second margin) and retries the same query
    with the same *cache* setting.
    """
    full_query = f""" {{ rateLimit {{ limit cost remaining resetAt }} {query} }} """
    url = 'https://api.github.com/graphql'
    json_query = {'query': full_query}
    headers = {'Authorization': 'bearer {}'.format(secret.token)}
    r = requests.post(url=url, json=json_query, headers=headers)
    if r.status_code != 200:
        raise ErrorCode(r.status_code, query)

    j = r.json()
    cls.ensure_not_none(j, f'query <{query}> result is <{j}>')

    def log_rate(payload):
        # Logs the rate-limit figures of `payload`; returns seconds to reset.
        limit, remaining, cost, reset_at, reset_in = cls._rate_v4(payload)
        log('v4 query <{}> rate limit <{}> remaing <{}> cost <{}> resetAt <{}> reset_in <{}>'
            .format(query, limit, remaining, cost, reset_at, reset_in))
        return reset_in

    if 'errors' in j:
        errors = j['errors']
        # Error entries are not guaranteed to carry a 'type' field, so use
        # .get(): a missing key must end in GraphQLError, not KeyError.
        if any(e.get('type') == 'RATE_LIMITED' for e in errors):
            j_rate = cls._get_v4('', cache=False)
            reset_in = log_rate(j_rate)
            # +3 to ensure
            log('v4 sleep <{}> and try again <{}>'.format(reset_in, query))
            time.sleep(reset_in + 3)
            log('v4 finish sleep <{}>'.format(query))
            # Keep the caller's cache choice on the retry.
            return cls._get_v4(query, cache=cache)
        raise GraphQLError(full_query, errors)

    log_rate(j)
    if cache:
        cls._set(query, r.text)
    return j
def _get_v3(cls, query, cache=True):
    """GET a GitHub v3 (REST) path and return the decoded JSON.

    Args:
        query: path appended to ``https://api.github.com``.
        cache: when true, the raw response text is stored via ``cls._set``.

    Returns:
        The decoded JSON response.

    Raises:
        ErrorCode202: the server answered 202.
        ErrorCode: any other non-200 status, except a 403 with zero
            remaining quota.

    On a 403 with zero remaining quota the method sleeps until the reset
    time (plus a 3 second margin) and then retries the same request.
    """
    base = 'https://api.github.com'
    url = '{}{}'.format(base, query)
    headers = {'Authorization': 'bearer {}'.format(secret.token)}
    r = requests.get(url=url, headers=headers)

    def log_rate():
        # Logs the rate figures of this response; returns (remaining, reset_in).
        rate_limit, rate_remaing, rate_reset, reset_in = cls._rate_v3(r)
        log('v3 rate limit <{}> rate remaing <{}> rate reset <{}> reset in <{}>'
            .format(
                rate_limit,
                rate_remaing,
                rate_reset,
                reset_in,
            ))
        return rate_remaing, reset_in

    if r.status_code == 200:
        log_rate()
        j = r.json()
        cls.ensure_not_none(j, f'query <{query}> result is <{j}>')
        if cache:
            cls._set(query, r.text)
        return j

    if r.status_code == 202:
        raise ErrorCode202(202, query)

    if r.status_code == 403:
        rate_remaing, reset_in = log_rate()
        # A 403 counts as rate limiting only when no quota is left.
        if rate_remaing == 0:
            # +3 to ensure
            log('v3 sleep <{}> and try again <{}>'.format(reset_in, query))
            time.sleep(reset_in + 3)
            log('v3 finish sleep <{}>'.format(query))
            # Retry for real; falling through here used to return None.
            return cls._get_v3(query, cache=cache)

    raise ErrorCode(r.status_code, query)
def get_v4_object(cls, query):
    """Log the request and return ``cls._get_v4_cache(query)`` unchanged."""
    log('get_v4_object', query)
    result = cls._get_v4_cache(query)
    return result
def log_data(users):
    """Log a summary of the collected data.

    In order: every rejected repository and contribution, one line per user
    (rank, login, stars and up to three starred contributions), then for
    each language the ``(login, stars)`` pairs sorted by stars, descending.
    *users* is iterated twice.
    """
    for rejected_repo in Repository.all_invalid:
        log('invalid repository', rejected_repo)
    for rejected_contribution in Contribution.all_invalid:
        log('wrong contribution', rejected_contribution)

    for rank, user in enumerate(users):
        pieces = ['user star:', f'{rank:3} {user.login:15} {user.star:5} ']
        for contribution in user.contribution[:3]:
            if contribution.star > 0:
                repo = contribution.repository
                pieces.append(
                    f'{repo.name_with_owner:40} {repo.language:12} {contribution.star:5} ')
        log(''.join(pieces))

    # language name -> list of (login, stars), in first-seen order
    by_language = {}
    for user in users:
        for entry in user.language:
            by_language.setdefault(entry[0], []).append((user.login, entry[1]))

    for name, ranking in by_language.items():
        log(name)
        log(sorted(ranking, key=lambda pair: pair[1], reverse=True))
    log('finish log data to stdout')