Example #1
import collections
import datetime
import logging

import dateutil.parser


def scrape_dotfiles_repos(num):
    """Scrape at most num dotfiles repos from GitHub for references to Vim
    plugin repos.

    We perform a search on GitHub repositories that are likely to contain
    Vundle and Pathogen bundles instead of a code search matching
    Vundle/Pathogen commands (which would have higher precision and recall),
    because
    GitHub's API requires code search to be limited to
    a user/repo/organization. :(
    """
    # Earliest allowable pushed date to start scraping from (so we won't be
    # scraping repos that were last pushed before this date).
    EARLIEST_PUSHED_DATE = datetime.datetime(2013, 1, 1)

    repos_scraped = 0
    scraped_counter = collections.Counter()

    for repo_name in _DOTFILE_REPO_NAMES:
        latest_repo = DotfilesGithubRepos.get_latest_with_keyword(repo_name)

        if latest_repo and latest_repo.get('pushed_at'):
            last_pushed_date = max(
                datetime.datetime.utcfromtimestamp(latest_repo['pushed_at']),
                EARLIEST_PUSHED_DATE)
        else:
            last_pushed_date = EARLIEST_PUSHED_DATE

        # We're going to scrape all repos updated after the latest updated repo
        # in our DB, starting with the least recently updated.  This maintains
        # the invariant that we have scraped all repos pushed before the latest
        # push date (and after EARLIEST_PUSHED_DATE).
        while True:

            start_date_iso = last_pushed_date.isoformat()
            search_params = {
                'q': '%s in:name pushed:>%s' % (repo_name, start_date_iso),
                'sort': 'updated',
                'order': 'asc',
            }

            per_page = 100
            response, search_data = get_api_page('search/repositories',
                                                 query_params=search_params,
                                                 page=1,
                                                 per_page=per_page)

            items = search_data.get('items', [])
            for item in items:
                try:
                    stats = _get_plugin_repos_from_dotfiles(item, repo_name)
                except ApiRateLimitExceededError:
                    logging.exception('API rate limit exceeded.')
                    return repos_scraped, scraped_counter
                except Exception:
                    logging.exception('Error scraping dotfiles repo %s' %
                                      item['full_name'])
                    stats = {}

                scraped_counter.update(stats)

                # If we've scraped the desired number of repos, we can quit.
                repos_scraped += 1
                if repos_scraped >= num:
                    return repos_scraped, scraped_counter

            # If we're about to exceed the rate limit (20 requests / min),
            # sleep until the limit resets.
            maybe_wait_until_api_limit_resets(response.headers)

            # If we've scraped all repos with this name, move on to the next
            # repo name.
            if len(items) < per_page:
                break
            else:
                last_pushed_date = dateutil.parser.parse(
                    items[-1]['pushed_at'])

    return repos_scraped, scraped_counter
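
The function above relies on module-level helpers that aren't shown here
(_DOTFILE_REPO_NAMES, DotfilesGithubRepos, get_api_page,
_get_plugin_repos_from_dotfiles, maybe_wait_until_api_limit_resets). For
context, here is a minimal sketch of what a get_api_page helper could look
like, assuming the standard GitHub REST endpoint and the requests library;
the real helper presumably also handles authentication and raises
ApiRateLimitExceededError, which this sketch omits.

import logging

import requests

GITHUB_API_BASE = 'https://api.github.com'  # assumed base URL for this sketch


def get_api_page(path, query_params=None, page=1, per_page=100):
    """Fetch one page of results from the GitHub API (illustrative sketch).

    Returns the raw response (so callers can inspect rate-limit headers)
    together with the decoded JSON body.
    """
    params = dict(query_params or {})
    params.update({'page': page, 'per_page': per_page})
    response = requests.get('%s/%s' % (GITHUB_API_BASE, path), params=params)
    if response.status_code != 200:
        logging.warning('GitHub API returned %s for %s',
                        response.status_code, path)
    return response, response.json()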
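
Similarly, a rough sketch of the maybe_wait_until_api_limit_resets helper,
assuming GitHub's documented X-RateLimit-Remaining and X-RateLimit-Reset
response headers (the latter a Unix timestamp); the threshold and exact
behavior of the real helper may differ.

import logging
import time


def maybe_wait_until_api_limit_resets(response_headers):
    """Sleep until the rate limit resets if the remaining quota is exhausted.

    Illustrative sketch: GitHub reports the remaining request quota in
    X-RateLimit-Remaining and the reset time (Unix timestamp) in
    X-RateLimit-Reset on every API response.
    """
    remaining = int(response_headers.get('X-RateLimit-Remaining', 1))
    if remaining > 0:
        return

    reset_timestamp = int(response_headers.get('X-RateLimit-Reset', 0))
    sleep_seconds = max(0, reset_timestamp - time.time()) + 1  # small buffer
    logging.info('Rate limit reached; sleeping %.0f seconds until reset.',
                 sleep_seconds)
    time.sleep(sleep_seconds)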