def _get_plugin_repos_from_dotfiles(repo_data, search_keyword):
    """Search a dotfiles repository for references to vim plugin repos and
    insert them into the DB.

    Arguments:
        repo_data: API response from GitHub for a repository.
        search_keyword: The search keyword that found this repo.
    """
    owner_repo = repo_data['full_name']

    # Print w/o newline.
    print " scraping %s ..." % owner_repo,
    sys.stdout.flush()

    res, contents_data = get_api_page('repos/%s/contents' % owner_repo)
    if res.status_code == 404 or not isinstance(contents_data, list):
        print "contents not found"
        return

    repos_by_manager = _extract_bundle_repos_from_dir(contents_data)
    vundle_repos = repos_by_manager.vundle
    neobundle_repos = repos_by_manager.neobundle
    vimplug_repos = repos_by_manager.vimplug

    pathogen_repos = _extract_pathogen_repos(contents_data)

    owner, repo_name = owner_repo.split('/')
    db_repo = DotfilesGithubRepos.get_with_owner_repo(owner, repo_name)
    pushed_date = dateutil.parser.parse(repo_data['pushed_at'])

    def stringify_repo(owner_repo_tuple):
        return '/'.join(owner_repo_tuple)

    repo = dict(db_repo or {}, **{
        'owner': owner,
        'pushed_at': util.to_timestamp(pushed_date),
        'repo_name': repo_name,
        'search_keyword': search_keyword,
        'vundle_repos': map(stringify_repo, vundle_repos),
        'neobundle_repos': map(stringify_repo, neobundle_repos),
        'vimplug_repos': map(stringify_repo, vimplug_repos),
        'pathogen_repos': map(stringify_repo, pathogen_repos),
    })

    DotfilesGithubRepos.log_scrape(repo)
    DotfilesGithubRepos.upsert_with_owner_repo(repo)

    print 'found %s Vundles, %s NeoBundles, %s VimPlugs, %s Pathogens' % (
        len(vundle_repos), len(neobundle_repos), len(vimplug_repos),
        len(pathogen_repos))

    return {
        'vundle_repos_count': len(vundle_repos),
        'neobundle_repos_count': len(neobundle_repos),
        'vimplug_repos_count': len(vimplug_repos),
        'pathogen_repos_count': len(pathogen_repos),
    }
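# The get_api_page helper used above is defined elsewhere in the project. As a
# rough, hedged sketch of what it might look like (not the project's actual
# implementation), one could wrap the GitHub REST API with the `requests`
# library and return an (HTTP response, decoded JSON) pair. The
# GITHUB_API_TOKEN environment variable and the token auth header below are
# assumptions for illustration only.
import os

import requests

GITHUB_API_BASE = 'https://api.github.com'


def _get_api_page_sketch(path, query_params=None, page=None, per_page=None):
    """Fetch one page of a GitHub API endpoint; return (response, data)."""
    params = dict(query_params or {})
    if page is not None:
        params['page'] = page
    if per_page is not None:
        params['per_page'] = per_page

    headers = {}
    token = os.environ.get('GITHUB_API_TOKEN')  # hypothetical config source
    if token:
        headers['Authorization'] = 'token %s' % token

    response = requests.get('%s/%s' % (GITHUB_API_BASE, path),
                            params=params, headers=headers)
    try:
        data = response.json()
    except ValueError:
        data = None
    return response, data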
def scrape_dotfiles_repos(num):
    """Scrape at most num dotfiles repos from GitHub for references to Vim
    plugin repos.

    We perform a search on GitHub repositories that are likely to contain
    Vundle and Pathogen bundles instead of a code search matching
    Vundle/Pathogen commands (which would have higher precision and recall),
    because GitHub's API requires code search to be limited to a
    user/repo/organization. :(
    """
    # Earliest allowable pushed date to start scraping from (so we won't be
    # scraping repos that were last pushed before this date).
    EARLIEST_PUSHED_DATE = datetime.datetime(2013, 1, 1)

    repos_scraped = 0
    scraped_counter = collections.Counter()

    for repo_name in _DOTFILE_REPO_NAMES:
        latest_repo = DotfilesGithubRepos.get_latest_with_keyword(repo_name)

        if latest_repo and latest_repo.get('pushed_at'):
            last_pushed_date = max(
                datetime.datetime.utcfromtimestamp(latest_repo['pushed_at']),
                EARLIEST_PUSHED_DATE)
        else:
            last_pushed_date = EARLIEST_PUSHED_DATE

        # We're going to scrape all repos updated after the latest updated
        # repo in our DB, starting with the least recently updated. This
        # maintains the invariant that we have scraped all repos pushed
        # before the latest push date (and after EARLIEST_PUSHED_DATE).
        while True:
            start_date_iso = last_pushed_date.isoformat()
            search_params = {
                'q': '%s in:name pushed:>%s' % (repo_name, start_date_iso),
                'sort': 'updated',
                'order': 'asc',
            }

            per_page = 100
            response, search_data = get_api_page('search/repositories',
                    query_params=search_params, page=1, per_page=per_page)

            items = search_data.get('items', [])
            for item in items:
                try:
                    stats = _get_plugin_repos_from_dotfiles(item, repo_name)
                except ApiRateLimitExceededError:
                    logging.exception('API rate limit exceeded.')
                    return repos_scraped, scraped_counter
                except Exception:
                    logging.exception('Error scraping dotfiles repo %s' %
                            item['full_name'])
                    stats = {}

                scraped_counter.update(stats)

                # If we've scraped the number of repos desired, we can quit.
                repos_scraped += 1
                if repos_scraped >= num:
                    return repos_scraped, scraped_counter

                # If we're about to exceed the rate limit (20 requests / min),
                # sleep until the limit resets.
                maybe_wait_until_api_limit_resets(response.headers)

            # If we've scraped all repos with this name, move on to the next
            # repo name.
            if len(items) < per_page:
                break
            else:
                last_pushed_date = dateutil.parser.parse(
                        items[-1]['pushed_at'])

    return repos_scraped, scraped_counter
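# maybe_wait_until_api_limit_resets is also defined elsewhere. A minimal
# sketch of one plausible implementation follows, assuming it reads GitHub's
# standard X-RateLimit-Remaining / X-RateLimit-Reset response headers and
# sleeps until the quota window resets once the remaining budget is nearly
# exhausted. The threshold and padding values are illustrative guesses, not
# the project's actual numbers.
import logging
import time


def _maybe_wait_until_api_limit_resets_sketch(response_headers):
    """Sleep until GitHub's rate limit window resets if quota is exhausted."""
    remaining = int(response_headers.get('X-RateLimit-Remaining', 1))
    if remaining > 1:
        return

    reset_timestamp = int(response_headers.get('X-RateLimit-Reset', 0))
    # Pad the sleep a little to absorb clock skew between us and GitHub.
    sleep_seconds = max(reset_timestamp - time.time(), 0) + 5
    logging.info('Rate limit nearly exhausted; sleeping %.0f seconds.',
            sleep_seconds)
    time.sleep(sleep_seconds)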