Example 1
def _get_plugin_repos_from_dotfiles(repo_data, search_keyword):
    """Search for references to vim plugin repos from a dotfiles repository,
    and insert them into DB.

    Arguments:
        repo_data: API response from GitHub of a repository.
        search_keyword: The keyword used that found this repo.
    """
    owner_repo = repo_data['full_name']

    # Print w/o newline.
    print "    scraping %s ..." % owner_repo,
    sys.stdout.flush()

    res, contents_data = get_api_page('repos/%s/contents' % owner_repo)

    if res.status_code == 404 or not isinstance(contents_data, list):
        print "contents not found"
        return

    repos_by_manager = _extract_bundle_repos_from_dir(contents_data)
    vundle_repos = repos_by_manager.vundle
    neobundle_repos = repos_by_manager.neobundle
    vimplug_repos = repos_by_manager.vimplug

    pathogen_repos = _extract_pathogen_repos(contents_data)

    owner, repo_name = owner_repo.split('/')
    db_repo = DotfilesGithubRepos.get_with_owner_repo(owner, repo_name)
    pushed_date = dateutil.parser.parse(repo_data['pushed_at'])

    def stringify_repo(owner_repo_tuple):
        return '/'.join(owner_repo_tuple)

    repo = dict(
        db_repo or {}, **{
            'owner': owner,
            'pushed_at': util.to_timestamp(pushed_date),
            'repo_name': repo_name,
            'search_keyword': search_keyword,
            'vundle_repos': map(stringify_repo, vundle_repos),
            'neobundle_repos': map(stringify_repo, neobundle_repos),
            'vimplug_repos': map(stringify_repo, vimplug_repos),
            'pathogen_repos': map(stringify_repo, pathogen_repos),
        })

    DotfilesGithubRepos.log_scrape(repo)
    DotfilesGithubRepos.upsert_with_owner_repo(repo)

    print 'found %s Vundles, %s NeoBundles, %s VimPlugs, %s Pathogens' % (
        len(vundle_repos), len(neobundle_repos), len(vimplug_repos),
        len(pathogen_repos))

    return {
        'vundle_repos_count': len(vundle_repos),
        'neobundle_repos_count': len(neobundle_repos),
        'vimplug_repos_count': len(vimplug_repos),
        'pathogen_repos_count': len(pathogen_repos),
    }
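
For context, here is a minimal, hypothetical driver for the function above: it fetches a single repository's metadata and scrapes it. It assumes get_api_page is importable from the same module; 'jdoe/dotfiles' and the 'dotfiles' keyword are illustrative placeholders, not real project data.

# Hypothetical single-repo run (sketch, not part of the original module).
_, repo_data = get_api_page('repos/jdoe/dotfiles')
counts = _get_plugin_repos_from_dotfiles(repo_data, 'dotfiles')
if counts:
    print 'plugin references found: %r' % counts
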
Example 2
def scrape_dotfiles_repos(num):
    """Scrape at most num dotfiles repos from GitHub for references to Vim
    plugin repos.

    We search GitHub repositories whose names suggest they contain Vundle and
    Pathogen bundles, rather than doing a code search for Vundle/Pathogen
    commands (which would give higher precision and recall), because GitHub's
    API restricts code search to a single user/repo/organization. :(
    """
    # Earliest allowable pushed date to start scraping from (so we won't be
    # scraping repos that were last pushed before this date).
    EARLIEST_PUSHED_DATE = datetime.datetime(2013, 1, 1)

    repos_scraped = 0
    scraped_counter = collections.Counter()

    for repo_name in _DOTFILE_REPO_NAMES:
        latest_repo = DotfilesGithubRepos.get_latest_with_keyword(repo_name)

        if latest_repo and latest_repo.get('pushed_at'):
            last_pushed_date = max(
                datetime.datetime.utcfromtimestamp(latest_repo['pushed_at']),
                EARLIEST_PUSHED_DATE)
        else:
            last_pushed_date = EARLIEST_PUSHED_DATE

        # We're going to scrape all repos updated after the latest updated repo
        # in our DB, starting with the least recently updated.  This maintains
        # the invariant that we have scraped all repos pushed before the latest
        # push date (and after EARLIEST_PUSHED_DATE).
        while True:

            start_date_iso = last_pushed_date.isoformat()
            search_params = {
                'q': '%s in:name pushed:>%s' % (repo_name, start_date_iso),
                'sort': 'updated',
                'order': 'asc',
            }

            per_page = 100
            response, search_data = get_api_page('search/repositories',
                                                 query_params=search_params,
                                                 page=1,
                                                 per_page=per_page)

            items = search_data.get('items', [])
            for item in items:
                try:
                    stats = _get_plugin_repos_from_dotfiles(item, repo_name)
                except ApiRateLimitExceededError:
                    logging.exception('API rate limit exceeded.')
                    return repos_scraped, scraped_counter
                except Exception:
                    logging.exception('Error scraping dotfiles repo %s' %
                                      item['full_name'])
                    stats = {}

                scraped_counter.update(stats)

                # If we've scraped the desired number of repos, we can quit.
                repos_scraped += 1
                if repos_scraped >= num:
                    return repos_scraped, scraped_counter

            # If we're about to exceed the rate limit (20 requests / min),
            # sleep until the limit resets.
            maybe_wait_until_api_limit_resets(response.headers)

            # If we've scraped all repos with this name, move on to the next
            # repo name.
            if len(items) < per_page:
                break
            else:
                last_pushed_date = dateutil.parser.parse(
                    items[-1]['pushed_at'])

    return repos_scraped, scraped_counter
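
The loop above relies on maybe_wait_until_api_limit_resets, which is referenced but not shown. A minimal sketch of how such a guard could work, using GitHub's documented X-RateLimit-Remaining and X-RateLimit-Reset response headers, follows; the project's actual helper may differ.

import time

def maybe_wait_until_api_limit_resets(headers):
    """Sleep until the GitHub rate limit resets if no requests remain.

    Sketch only: based on GitHub's documented X-RateLimit-* headers; the
    real helper in this project may behave differently.
    """
    remaining = int(headers.get('X-RateLimit-Remaining', 1))
    if remaining <= 0:
        reset_at = int(headers.get('X-RateLimit-Reset', time.time()))
        # Sleep one extra second so we don't wake up just before the reset.
        time.sleep(max(0, reset_at - int(time.time())) + 1)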
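
Finally, an illustrative top-level invocation (not part of the original module) might look like this:

if __name__ == '__main__':
    # Scrape up to 100 dotfiles repos and report what was found; the cap of
    # 100 is an arbitrary example value.
    num_scraped, counter = scrape_dotfiles_repos(100)
    print 'scraped %d repos: %s' % (num_scraped, dict(counter))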