import tempfile

import pytest


def test_get_request_rate_limited():

    cachedir = tempfile.mkdtemp()

    gh = GithubMock()
    gw = GithubWrapper(gh, token=12345, cachedir=cachedir)

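    # GithubMock is expected to simulate rate-limited API responses, so
    # get_request should raise instead of returning data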
    with pytest.raises(RateLimitError):
        gw.get_request('https://foo.bar.com/test')
Example #2
import tempfile

import pytest


def test_get_request_rate_limited():
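    # patch out the connect step so the wrapper never opens a real session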
    GithubWrapper._connect = lambda *args: None
    gw = GithubWrapper(token=12345, cachedir=tempfile.mkdtemp())

    with pytest.raises(RateLimitError):
        gw.get_request('https://foo.bar.com/test')
Example #3
import hashlib
import json
import logging
import os


class Scraper:
    def __init__(self):
        self.gh = GithubWrapper(None, token=C.DEFAULT_GITHUB_TOKEN)

        self.cachedir = '/tmp/pings.cache'
        if not os.path.exists(self.cachedir):
            os.makedirs(self.cachedir)

        bylogin = {}
        byissue = {}

        numbers = self.get_numbers()
        for idn, number in enumerate(numbers):

            logging.info('%s|%s issue %s', len(numbers), idn + 1, number)

            #if idn < 6000:
            #    continue

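            # only scan the first 7000 issues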
            if idn > 7000:
                break

            issue = self.get_issue(number)

            if 'url' not in issue:
                continue

            url = issue['url']
            labels = [x['name'] for x in issue['labels']]
            login = issue['user']['login']

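            # per-issue record: the author, every @-mentioned team member,
            # and the earliest mention/response timestamps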
            byissue[url] = {
                'login': login,
                'team': set(),
                'mentions': {},
                'mentioned': None,
                'responded': None,
                'bug': 'bug' in labels,
                'feature': 'feature' in labels,
                'pull': 'pull' in issue['html_url']
            }

            comments = self.get_comments(number)
            if not comments:
                continue

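            # first pass: record who was @-mentioned and the earliest mention time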
            for comment in comments:

                # skip malformed comments and comments from deleted accounts
                if comment is None or comment['user'] is None:
                    continue

                login = comment['user']['login']
                mentions = self.parse_mentions(comment['body'])

                if mentions:
                    for mention in mentions:
                        byissue[url]['team'].add(mention)

                        if mention not in byissue[url]['mentions']:
                            byissue[url]['mentions'][mention] = {
                                'mentioned': comment['created_at'],
                                'responded': None
                            }
                        entry = byissue[url]['mentions'][mention]
                        if comment['created_at'] < entry['mentioned']:
                            entry['mentioned'] = comment['created_at']

                    # team generally mentioned?
                    if byissue[url]['mentioned'] is None or \
                            byissue[url]['mentioned'] > comment['created_at']:
                        byissue[url]['mentioned'] = comment['created_at']

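            # second pass: find the earliest response from anyone who was mentioned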
            for comment in comments:
                if comment is None or comment['user'] is None:
                    continue

                login = comment['user']['login']

                if login in byissue[url]['team']:
                    # team generally responded?
                    if byissue[url]['responded'] is None or \
                            byissue[url]['responded'] > comment['created_at']:
                        byissue[url]['responded'] = comment['created_at']

                    entry = byissue[url]['mentions'][login]
                    if entry['responded'] is None or \
                            entry['responded'] > comment['created_at']:
                        entry['responded'] = comment['created_at']

        report(byissue)

    def get_numbers(self):
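        # issue numbers come from a GraphQL summary query, cached on disk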
        gq_cache_file = os.path.join(self.cachedir, 'gql_cache.json')

        if not os.path.exists(gq_cache_file):
            gqlc = GithubGraphQLClient(C.DEFAULT_GITHUB_TOKEN)
            summaries = gqlc.get_issue_summaries('ansible/ansible')
            with open(gq_cache_file, 'w') as f:
                f.write(json.dumps(summaries))
        else:
            with open(gq_cache_file, 'r') as f:
                summaries = json.loads(f.read())

        numbers = set()
        for v in summaries.values():
            #if v['state'] != 'open':
            #    continue
            numbers.add(v['number'])
        numbers = sorted(numbers, reverse=True)
        return numbers

    def get_issue(self, number):
        issue_url = 'https://api.github.com/repos/ansible/ansible/issues/%s' % number
        issue = self.get_url(issue_url)
        return issue

    def get_comments(self, number):
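        # issue comments, plus review comments when the issue is a pull request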
        issue_url = 'https://api.github.com/repos/ansible/ansible/issues/%s' % number
        issue = self.get_url(issue_url)
        comments_url = 'https://api.github.com/repos/ansible/ansible/issues/%s/comments' % number
        comments = self.get_url(comments_url)

        reviews = []
        if 'pull' in issue['html_url']:
            pull = self.get_url(issue['pull_request']['url'])
            if pull['review_comments'] > 0:
                reviews = self.get_url(pull['review_comments_url'])

        return comments + reviews

    def get_url(self, url):
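        # cache each GET response on disk, keyed by the md5 of its URL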
        cachedir = os.path.join(self.cachedir, 'requests')
        if not os.path.exists(cachedir):
            os.makedirs(cachedir)
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        digest = m.hexdigest()

        cachefile = os.path.join(cachedir, '%s.json' % digest)
        if not os.path.exists(cachefile):
            data = self.gh.get_request(url)
            with open(cachefile, 'w') as f:
                f.write(json.dumps(data))
        else:
            with open(cachefile, 'r') as f:
                data = json.loads(f.read())

        return data

    def parse_mentions(self, body):

        mentioned = set()

        if '@' in body:
            for word in body.split():
                if not word.startswith('@'):
                    continue
                login = word.replace('@', '').rstrip(',')
                if not login.strip():
                    continue
                # skip tokens that are clearly not bare logins
                # (quotes, parens, paths, code fragments)
                if any(c in login for c in '"\'()/\\{'):
                    continue
                mentioned.add(login)

        return list(mentioned)
Example #4
import json
import logging
import os
import time
from pprint import pprint


def main():

    logFormatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.DEBUG)
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    summaries = None
    gq_cache_file = '/tmp/gql_cache.json'

    if not os.path.exists(gq_cache_file):
        gqlc = GithubGraphQLClient(C.DEFAULT_GITHUB_TOKEN)
        summaries = gqlc.get_issue_summaries('ansible/ansible')
        with open(gq_cache_file, 'w') as f:
            f.write(json.dumps(summaries))
    else:
        with open(gq_cache_file, 'r') as f:
            summaries = json.loads(f.read())

    numbers = set()
    for v in summaries.values():
        if v['state'] != 'open':
            continue
        numbers.add(v['number'])
    numbers = sorted(numbers, reverse=True)

    gh = GithubWrapper(None, token=C.DEFAULT_GITHUB_TOKEN)

    for idn, number in enumerate(numbers):
        logging.info('%s|%s issue %s', len(numbers), idn + 1, number)

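        # only process issues at or below a fixed cutoff number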
        if number > 52979:
            continue

        comments_url = 'https://api.github.com/repos/ansible/ansible/issues/%s/comments' % number
        comments = gh.get_request(comments_url)

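        # group ansibot comments by body text; identical bodies are duplicates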
        duplicates = {}
        for comment in comments:
            if comment['user']['login'] != 'ansibot':
                continue
            if comment['body'] not in duplicates:
                duplicates[comment['body']] = []
            duplicates[comment['body']].append(comment['id'])

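            # drop comment bodies that only appeared once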
        if duplicates:
            topop = []
            for k, v in duplicates.items():
                if len(v) <= 1:
                    topop.append(k)
            for tp in topop:
                duplicates.pop(tp, None)

            if duplicates:
                for v in duplicates.values():
                    dupes = [x for x in comments if x['id'] in v]
                    dupes = sorted(dupes, key=lambda x: x['created_at'])

                    pprint([[x['id'], x['body']] for x in dupes])

                    #if '<!--- boilerplate: notify --->' not in dupes[0]['body']:
                    #    continue

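                    # keep the oldest copy and delete the rest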
                    for dupe in dupes[1:]:
                        gh.delete_request(dupe['url'])
                    time.sleep(1)