import hashlib
import json
import logging
import os
import tempfile
import time

from pprint import pprint

import pytest

# Project imports; module paths assumed from the ansibullbot source layout.
import ansibullbot.constants as C
from ansibullbot.errors import RateLimitError
from ansibullbot.utils.gh_gql_client import GithubGraphQLClient
from ansibullbot.wrappers.ghapiwrapper import GithubWrapper


def test_get_request_rate_limited_with_mock():
    # GithubMock is assumed to be the stub client defined alongside this test.
    cachedir = tempfile.mkdtemp()
    gh = GithubMock()
    gw = GithubWrapper(gh, token=12345, cachedir=cachedir)
    with pytest.raises(RateLimitError):
        gw.get_request('https://foo.bar.com/test')


def test_get_request_rate_limited():
    # Stub the connection so the wrapper never talks to the real API.
    GithubWrapper._connect = lambda *args: None
    gw = GithubWrapper(token=12345, cachedir=tempfile.mkdtemp())
    with pytest.raises(RateLimitError):
        gw.get_request('https://foo.bar.com/test')
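

# A sketch of the same test written with pytest's monkeypatch and tmp_path
# fixtures, so the stubbed _connect is restored after the test instead of
# mutating GithubWrapper globally. It assumes the same constructor and
# get_request() signature used above.
def test_get_request_rate_limited_isolated(monkeypatch, tmp_path):
    monkeypatch.setattr(GithubWrapper, '_connect', lambda *args: None)
    gw = GithubWrapper(token=12345, cachedir=str(tmp_path))
    with pytest.raises(RateLimitError):
        gw.get_request('https://foo.bar.com/test')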
class Scraper:

    def __init__(self):
        self.gh = GithubWrapper(None, token=C.DEFAULT_GITHUB_TOKEN)
        self.cachedir = '/tmp/pings.cache'
        if not os.path.exists(self.cachedir):
            os.makedirs(self.cachedir)

        byissue = {}
        numbers = self.get_numbers()
        for idn, number in enumerate(numbers):
            logging.info('%s|%s issue %s' % (len(numbers), idn + 1, number))
            if idn > 7000:
                break

            issue = self.get_issue(number)
            if 'url' not in issue:
                continue
            url = issue['url']
            labels = [x['name'] for x in issue['labels']]
            byissue[url] = {
                'login': issue['user']['login'],
                'team': set(),
                'mentions': {},
                'mentioned': None,
                'responded': None,
                'bug': 'bug' in labels,
                'feature': 'feature' in labels,
                'pull': 'pull' in issue['html_url'],
            }

            comments = self.get_comments(number)
            if not comments:
                continue

            # First pass: collect @mentions and the earliest mention times.
            for comment in comments:
                if comment is None:
                    import epdb; epdb.st()
                if comment['user'] is None:
                    continue
                mentions = self.parse_mentions(comment['body'])
                if not mentions:
                    continue
                for mention in mentions:
                    byissue[url]['team'].add(mention)
                    if mention not in byissue[url]['mentions']:
                        byissue[url]['mentions'][mention] = {
                            'mentioned': comment['created_at'],
                            'responded': None,
                        }
                    if comment['created_at'] < byissue[url]['mentions'][mention]['mentioned']:
                        byissue[url]['mentions'][mention]['mentioned'] = comment['created_at']
                # earliest time anyone on the team was mentioned
                if byissue[url]['mentioned'] is None or \
                        byissue[url]['mentioned'] > comment['created_at']:
                    byissue[url]['mentioned'] = comment['created_at']

            # Second pass: find the earliest responses from mentioned logins.
            for comment in comments:
                if comment is None:
                    import epdb; epdb.st()
                if comment['user'] is None:
                    continue
                login = comment['user']['login']
                if login not in byissue[url]['team']:
                    continue
                # earliest time anyone on the team responded
                if byissue[url]['responded'] is None or \
                        byissue[url]['responded'] > comment['created_at']:
                    byissue[url]['responded'] = comment['created_at']
                # earliest time this particular login responded
                if byissue[url]['mentions'][login]['responded'] is None or \
                        byissue[url]['mentions'][login]['responded'] > comment['created_at']:
                    byissue[url]['mentions'][login]['responded'] = comment['created_at']

        # report() is assumed to be defined elsewhere in the project.
        report(byissue)

    def get_numbers(self):
        gq_cache_file = os.path.join(self.cachedir, 'gql_cache.json')
        if not os.path.exists(gq_cache_file):
            gqlc = GithubGraphQLClient(C.DEFAULT_GITHUB_TOKEN)
            summaries = gqlc.get_issue_summaries('ansible/ansible')
            with open(gq_cache_file, 'w') as f:
                f.write(json.dumps(summaries))
        else:
            with open(gq_cache_file, 'r') as f:
                summaries = json.loads(f.read())

        numbers = set()
        for k, v in summaries.items():
            numbers.add(v['number'])
        return sorted(numbers, reverse=True)

    def get_issue(self, number):
        issue_url = 'https://api.github.com/repos/ansible/ansible/issues/%s' % number
        return self.get_url(issue_url)

    def get_comments(self, number):
        issue_url = 'https://api.github.com/repos/ansible/ansible/issues/%s' % number
        issue = self.get_url(issue_url)
        comments_url = 'https://api.github.com/repos/ansible/ansible/issues/%s/comments' % number
        comments = self.get_url(comments_url)
        reviews = []
        if 'pull' in issue['html_url']:
            pull = self.get_url(issue['pull_request']['url'])
            if pull['review_comments'] > 0:
                reviews = self.get_url(pull['review_comments_url'])
        return comments + reviews

    def get_url(self, url):
        # Cache raw API responses on disk, keyed by the md5 of the url.
        cachedir = os.path.join(self.cachedir, 'requests')
        if not os.path.exists(cachedir):
            os.makedirs(cachedir)
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        cachefile = os.path.join(cachedir, '%s.json' % m.hexdigest())
        if not os.path.exists(cachefile):
            data = self.gh.get_request(url)
            with open(cachefile, 'w') as f:
                f.write(json.dumps(data))
        else:
            with open(cachefile, 'r') as f:
                data = json.loads(f.read())
        return data

    def parse_mentions(self, body):
        # Extract @login mentions, skipping tokens containing characters
        # that never appear in a github login.
        mentioned = set()
        if '@' in body:
            for word in body.split():
                if not word.startswith('@'):
                    continue
                login = word.replace('@', '')
                if not login.strip():
                    continue
                if any(c in login for c in '"\'()/\\{'):
                    continue
                login = login.rstrip(',')
                if login:
                    mentioned.add(login)
        return list(mentioned)
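

# __init__ above ends by calling report(byissue), which is defined elsewhere.
# A minimal sketch of what such a reporter could compute from the byissue
# fields built above (a hypothetical helper, not the project's actual report):
def report_sketch(byissue):
    for url, meta in sorted(byissue.items()):
        if meta['mentioned'] is None:
            continue
        status = 'responded %s' % meta['responded'] if meta['responded'] else 'no response yet'
        print('%s: first mention %s, %s' % (url, meta['mentioned'], status))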
def main():
    logFormatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.DEBUG)
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    gq_cache_file = '/tmp/gql_cache.json'
    if not os.path.exists(gq_cache_file):
        gqlc = GithubGraphQLClient(C.DEFAULT_GITHUB_TOKEN)
        summaries = gqlc.get_issue_summaries('ansible/ansible')
        with open(gq_cache_file, 'w') as f:
            f.write(json.dumps(summaries))
    else:
        with open(gq_cache_file, 'r') as f:
            summaries = json.loads(f.read())

    numbers = set()
    for k, v in summaries.items():
        if v['state'] != 'open':
            continue
        numbers.add(v['number'])
    numbers = sorted(numbers, reverse=True)

    gh = GithubWrapper(None, token=C.DEFAULT_GITHUB_TOKEN)
    for idn, number in enumerate(numbers):
        logging.info('%s|%s issue %s' % (len(numbers), idn + 1, number))
        if number > 52979:
            continue

        comments_url = 'https://api.github.com/repos/ansible/ansible/issues/%s/comments' % number
        comments = gh.get_request(comments_url)

        # Group ansibot comments by body; identical bodies are duplicates.
        duplicates = {}
        for comment in comments:
            if comment['user']['login'] != 'ansibot':
                continue
            if comment['body'] not in duplicates:
                duplicates[comment['body']] = []
            duplicates[comment['body']].append(comment['id'])

        # Discard bodies that only occurred once.
        topop = [k for k, v in duplicates.items() if len(v) <= 1]
        for tp in topop:
            duplicates.pop(tp, None)

        # Keep the oldest copy of each duplicated comment, delete the rest.
        for k, v in duplicates.items():
            dupes = [x for x in comments if x['id'] in v]
            dupes = sorted(dupes, key=lambda x: x['created_at'])
            pprint([[x['id'], x['body']] for x in dupes])
            for dupe in dupes[1:]:
                gh.delete_request(dupe['url'])
                time.sleep(1)
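

# The duplicate detection inside main() can be factored into a pure helper,
# which makes the logic unit-testable without touching the API. A sketch
# over the same comment dicts (id/body/user fields) the API returns; the
# name find_bot_duplicates is hypothetical:
def find_bot_duplicates(comments, botname='ansibot'):
    duplicates = {}
    for comment in comments:
        if comment['user']['login'] != botname:
            continue
        duplicates.setdefault(comment['body'], []).append(comment['id'])
    # keep only bodies that appeared more than once
    return {body: ids for body, ids in duplicates.items() if len(ids) > 1}


# e.g. find_bot_duplicates(gh.get_request(comments_url))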