Example 1
    def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
        """Return a dict of all issue summaries with numbers as keys

        Adds a compatibility method for the webscraper

        Args:
            repo_url  (str): username/repository
            baseurl   (str): not used
            cachefile (str): not used
        """
        owner = repo_url.split(u'/', 1)[0]
        repo = repo_url.split(u'/', 1)[1]
        summaries = self.get_all_summaries(owner, repo)

        issues = {}
        for x in summaries:
            issues[to_text(x[u'number'])] = x

        # keep the summaries for out of band analysis
        repodata = {
            u'user': repo_url.split(u'/', 1)[0],
            u'repo': repo_url.split(u'/', 1)[1],
        }
        post_to_receiver(u'summaries', repodata, issues)

        return issues
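This variant keys the returned dict by the issue number converted to text. A minimal standalone sketch of that keying convention, with sample records invented for illustration (they are not taken from the source):

# Summaries as an API might return them; these records are made up.
summaries = [
    {'number': 101, 'title': 'module fails on py3'},
    {'number': 102, 'title': 'docs typo'},
]

# Key by the issue number as text, mirroring the loop in the method above.
issues = {str(summary['number']): summary for summary in summaries}

assert '101' in issues       # keys are text ...
assert 101 not in issues     # ... so integer lookups miss
print(issues['102']['title'])

Textual keys matter downstream: the scraping variants below compare and back-fill entries by issue number as text.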
Example 2
    def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
        '''Paginate through github's web interface and scrape summaries'''

        # repo_url - https://github.com/ansible/ansible for example
        # baseurl - an entrypoint for one-off utils to scrape specific issue
        #           query urls. NOTE: this disables writing a cache

        # get cached
        if not baseurl:
            issues = self.load_summaries(repo_url)
        else:
            issues = {}

        if not baseurl:
            url = repo_url
            url += '/issues'
            url += '?'
            url += 'q='
            url += urllib2.quote('sort:updated-desc')
        else:
            url = baseurl

        namespace = repo_url.split('/')[-2]
        reponame = repo_url.split('/')[-1]

        rr = self._request_url(url)
        soup = BeautifulSoup(rr.text, 'html.parser')
        data = self._parse_issue_summary_page(soup)
        if data['issues']:
            # send to receiver
            post_to_receiver('html_summaries', {
                'user': namespace,
                'repo': reponame
            }, data['issues'])
            # update master list
            issues.update(data['issues'])

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        # walk the remaining result pages, merging each batch of summaries
        # into the running dict until a page yields nothing new
        while data['next_page']:
            rr = self._request_url(self.baseurl + data['next_page'])
            soup = BeautifulSoup(rr.text, 'html.parser')
            data = self._parse_issue_summary_page(soup)

            # send to receiver
            post_to_receiver('html_summaries', {
                'user': namespace,
                'repo': reponame
            }, data['issues'])

            if not data['next_page'] or not data['issues']:
                break

            changed = []
            changes = False
            for k, v in data['issues'].iteritems():

                if not isinstance(k, unicode):
                    k = u'%s' % k

                if k not in issues:
                    changed.append(k)
                    changes = True
                elif v != issues[k]:
                    changed.append(k)
                    changes = True
                issues[k] = v

            if changed:
                logging.info('changed: %s' % ','.join(x for x in changed))

            if not baseurl:
                self.dump_summaries_tmp(repo_url, issues)

            if not changes:
                break

        # get missing
        if not baseurl:
            numbers = sorted([int(x) for x in issues.keys()])
            missing = [x for x in xrange(1, numbers[-1]) if x not in numbers]
            for x in missing:
                summary = self.get_single_issue_summary(repo_url,
                                                        x,
                                                        force=True)
                if summary:
                    post_to_receiver('html_summaries', {
                        'user': namespace,
                        'repo': reponame
                    }, {x: summary})
                    if not isinstance(x, unicode):
                        x = u'%s' % x
                    issues[x] = summary

        # get missing timestamps
        if not baseurl:
            numbers = sorted([int(x) for x in issues.keys()])
            missing = [
                x for x in numbers
                if str(x) not in issues or not issues[str(x)]['updated_at']
            ]
            for x in missing:
                summary = self.get_single_issue_summary(repo_url,
                                                        x,
                                                        force=True)
                if summary:
                    post_to_receiver('html_summaries', {
                        'user': namespace,
                        'repo': reponame
                    }, {x: summary})
                    if not isinstance(x, unicode):
                        x = u'%s' % x
                    issues[x] = summary

        # save the cache
        if not baseurl:
            self.dump_summaries(repo_url, issues)

        return issues
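The "get missing" pass above reduces to finding holes in the sequence of cached issue numbers and re-fetching those issues one at a time with get_single_issue_summary(). A standalone sketch of the gap detection, with illustrative cache contents:

# Cached summaries keyed by issue number as text; the keys here are made up.
issues = {'1': {}, '2': {}, '5': {}, '6': {}}

numbers = sorted(int(x) for x in issues)
# Anything below the highest known number that is not cached is "missing".
missing = [x for x in range(1, numbers[-1]) if x not in numbers]

print(missing)  # -> [3, 4]

The membership test scans a list, as in the original; for a repository with tens of thousands of issues a set of numbers would make it cheaper, but the behaviour is the same.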
Example 3
    def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
        '''Paginate through github's web interface and scrape summaries'''

        # repo_url - https://github.com/ansible/ansible for example
        # baseurl - an entrypoint for one-off utils to scrape specific issue
        #           query urls. NOTE: this disables writing a cache

        # get cached
        if not baseurl:
            issues = self.load_summaries(repo_url)
        else:
            issues = {}

        if not baseurl:
            url = repo_url
            url += u'/issues'
            url += u'?'
            url += u'q='
            url += urllib2.quote(u'sort:updated-desc')
        else:
            url = baseurl

        namespace = repo_url.split(u'/')[-2]
        reponame = repo_url.split(u'/')[-1]

        rr = self._request_url(url)
        soup = BeautifulSoup(rr.text, u'html.parser')
        data = self._parse_issue_summary_page(soup)
        if data[u'issues']:
            # send to receiver
            post_to_receiver(u'html_summaries', {u'user': namespace, u'repo': reponame}, data[u'issues'])
            # update master list
            issues.update(data[u'issues'])

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        while data[u'next_page']:
            rr = self._request_url(self.baseurl + data[u'next_page'])
            soup = BeautifulSoup(rr.text, u'html.parser')
            data = self._parse_issue_summary_page(soup)

            # send to receiver
            post_to_receiver(u'html_summaries', {u'user': namespace, u'repo': reponame}, data[u'issues'])

            if not data[u'next_page'] or not data[u'issues']:
                break

            changed = []
            changes = False
            for k, v in six.iteritems(data[u'issues']):

                if not isinstance(k, unicode):
                    k = u'%s' % k

                if k not in issues:
                    changed.append(k)
                    changes = True
                elif v != issues[k]:
                    changed.append(k)
                    changes = True
                issues[k] = v

            if changed:
                logging.info(u'changed: %s' % u','.join(x for x in changed))

            if not baseurl:
                self.dump_summaries_tmp(repo_url, issues)

            if not changes:
                break

        # get missing
        if not baseurl:
            numbers = sorted([int(x) for x in issues.keys()])
            missing = [x for x in xrange(1, numbers[-1]) if x not in numbers]
            for x in missing:
                summary = self.get_single_issue_summary(repo_url, x, force=True)
                if summary:
                    post_to_receiver(u'html_summaries', {u'user': namespace, u'repo': reponame}, {x: summary})
                    if not isinstance(x, unicode):
                        x = u'%s' % x
                    issues[x] = summary

        # get missing timestamps
        if not baseurl:
            numbers = sorted([int(x) for x in issues.keys()])
            missing = [x for x in numbers if to_text(x) not in issues or not issues[to_text(x)][u'updated_at']]
            for x in missing:
                summary = self.get_single_issue_summary(repo_url, x, force=True)
                if summary:
                    post_to_receiver(u'html_summaries', {u'user': namespace, u'repo': reponame}, {x: summary})
                    if not isinstance(x, unicode):
                        x = u'%s' % x
                    issues[x] = summary

        # save the cache
        if not baseurl:
            self.dump_summaries(repo_url, issues)

        return issues
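Both scraping variants depend on a _parse_issue_summary_page() helper that is not shown in these examples; from the way it is called, it must return a dict with an 'issues' mapping plus a 'next_page' link. A hypothetical sketch of such a helper, using invented markup and CSS selectors purely to illustrate that return shape (GitHub's real issue-list HTML differs):

from bs4 import BeautifulSoup

# Invented HTML standing in for one page of an issue listing; the real
# GitHub markup is different and is not part of these examples.
HTML = """
<div class="issue-row" data-number="7">
  <a class="title" href="/ansible/ansible/issues/7">crash on start</a>
</div>
<div class="issue-row" data-number="8">
  <a class="title" href="/ansible/ansible/issues/8">docs update</a>
</div>
<a class="next_page" href="/ansible/ansible/issues?page=2">Next</a>
"""

def parse_issue_summary_page(soup):
    '''Return {'issues': {number: summary}, 'next_page': href or None}.'''
    data = {'issues': {}, 'next_page': None}
    for row in soup.select('div.issue-row'):
        number = row['data-number']
        link = row.select_one('a.title')
        data['issues'][number] = {
            'number': number,
            'title': link.get_text(strip=True),
            'href': link.get('href'),
        }
    next_link = soup.select_one('a.next_page')
    if next_link is not None:
        data['next_page'] = next_link.get('href')
    return data

soup = BeautifulSoup(HTML, 'html.parser')
print(parse_issue_summary_page(soup))

The real summaries also carry an 'updated_at' timestamp, which the missing-timestamps pass above checks for; it is omitted here to keep the sketch minimal.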