def get_plugin_info(vimorg_id):
    """Gets some more detailed information about a vim.org script.

    Scrapes a given vim.org script page, and returns some detailed
    information about the plugin that is not available from the search page,
    like how many people rated a plugin, the author's name, and a long
    description.

    Arguments:
        vimorg_id: The vim.org script ID, interpolated into the
            ``script_id`` query parameter of the script page URL.

    Returns:
        A dict with the keys "vimorg_num_raters", "vimorg_author",
        "vimorg_long_desc", "vimorg_install_details", "updated_at" and
        "created_at".

    Raises:
        AssertionError: If a label cell of the page ("created by",
            "description", "install details", "date") is not at the
            position this scraper expects.
        IndexError: If one of the expected tables/cells is missing entirely.
    """
    res = requests.get(
        'http://www.vim.org/scripts/script.php?script_id=%s' % vimorg_id,
        timeout=10)

    html = lxml.html.html5parser.document_fromstring(res.text, parser=PARSER)

    # The bold text of the "Rating" cell is matched as "<number>/<number>";
    # the second number is what gets reported as the rater count.
    rating = html.xpath('//td[contains(text(),"Rating")]/b')[0]
    rating_denom = int(re.search(r"(\d+)/(\d+)", rating.text).group(2))

    # Rows of the table that holds the "prompt"-classed label cells. Label
    # rows and value rows are addressed by fixed position, so each label is
    # checked before the row that follows it is trusted.
    body_trs = html.xpath(
        '//table[tbody/tr/td[contains(@class,"prompt")]]/*/*')

    assert body_trs[0][0].text == "created by", (
        'unexpected vim.org page layout: expected "created by" label, '
        'got %r' % body_trs[0][0].text)
    creator = body_trs[1][0][0].text

    assert body_trs[6][0].text == "description", (
        'unexpected vim.org page layout: expected "description" label, '
        'got %r' % body_trs[6][0].text)
    description_node = body_trs[7][0]

    assert body_trs[9][0].text == "install details", (
        'unexpected vim.org page layout: expected "install details" label, '
        'got %r' % body_trs[9][0].text)
    install_node = body_trs[10][0]

    # Rows of the release table: row 0 is the header, the remaining rows are
    # individual releases.
    download_trs = html.xpath(
        '//table[tbody/tr/th[text()="release notes"]]/*/*')

    # Parse created and updated dates. The first release row supplies the
    # updated date and the last release row supplies the created date.
    assert download_trs[0][2].text == "date", (
        'unexpected vim.org page layout: expected "date" column header, '
        'got %r' % download_trs[0][2].text)
    updated_date_text = download_trs[1][2][0].text
    created_date_text = download_trs[-1][2][0].text

    date_format = "%Y-%m-%d"
    updated_date = datetime.datetime.strptime(updated_date_text, date_format)
    created_date = datetime.datetime.strptime(created_date_text, date_format)

    return {
        "vimorg_num_raters": rating_denom,
        "vimorg_author": creator,
        "vimorg_long_desc": _get_inner_text(description_node),
        "vimorg_install_details": _get_inner_text(install_node),
        "updated_at": util.to_timestamp(updated_date),
        "created_at": util.to_timestamp(created_date),
    }
def _get_plugin_repos_from_dotfiles(repo_data, search_keyword): """Search for references to vim plugin repos from a dotfiles repository, and insert them into DB. Arguments: repo_data: API response from GitHub of a repository. search_keyword: The keyword used that found this repo. """ owner_repo = repo_data['full_name'] # Print w/o newline. print " scraping %s ..." % owner_repo, sys.stdout.flush() res, contents_data = get_api_page('repos/%s/contents' % owner_repo) if res.status_code == 404 or not isinstance(contents_data, list): print "contents not found" return repos_by_manager = _extract_bundle_repos_from_dir(contents_data) vundle_repos = repos_by_manager.vundle neobundle_repos = repos_by_manager.neobundle vimplug_repos = repos_by_manager.vimplug pathogen_repos = _extract_pathogen_repos(contents_data) owner, repo_name = owner_repo.split('/') db_repo = DotfilesGithubRepos.get_with_owner_repo(owner, repo_name) pushed_date = dateutil.parser.parse(repo_data['pushed_at']) def stringify_repo(owner_repo_tuple): return '/'.join(owner_repo_tuple) repo = dict( db_repo or {}, **{ 'owner': owner, 'pushed_at': util.to_timestamp(pushed_date), 'repo_name': repo_name, 'search_keyword': search_keyword, 'vundle_repos': map(stringify_repo, vundle_repos), 'neobundle_repos': map(stringify_repo, neobundle_repos), 'vimplug_repos': map(stringify_repo, vimplug_repos), 'pathogen_repos': map(stringify_repo, pathogen_repos), }) DotfilesGithubRepos.log_scrape(repo) DotfilesGithubRepos.upsert_with_owner_repo(repo) print 'found %s Vundles, %s NeoBundles, %s VimPlugs, %s Pathogens' % ( len(vundle_repos), len(neobundle_repos), len(vimplug_repos), len(pathogen_repos)) return { 'vundle_repos_count': len(vundle_repos), 'neobundle_repos_count': len(neobundle_repos), 'vimplug_repos_count': len(vimplug_repos), 'pathogen_repos_count': len(pathogen_repos), }
def _get_plugin_repos_from_dotfiles(repo_data, search_keyword): """Search for references to vim plugin repos from a dotfiles repository, and insert them into DB. Arguments: repo_data: API response from GitHub of a repository. search_keyword: The keyword used that found this repo. """ owner_repo = repo_data['full_name'] # Print w/o newline. print " scraping %s ..." % owner_repo, sys.stdout.flush() res, contents_data = get_api_page('repos/%s/contents' % owner_repo) if res.status_code == 404 or not isinstance(contents_data, list): print "contents not found" return repos_by_manager = _extract_bundle_repos_from_dir(contents_data) vundle_repos = repos_by_manager.vundle neobundle_repos = repos_by_manager.neobundle vimplug_repos = repos_by_manager.vimplug pathogen_repos = _extract_pathogen_repos(contents_data) owner, repo_name = owner_repo.split('/') db_repo = DotfilesGithubRepos.get_with_owner_repo(owner, repo_name) pushed_date = dateutil.parser.parse(repo_data['pushed_at']) def stringify_repo(owner_repo_tuple): return '/'.join(owner_repo_tuple) repo = dict(db_repo or {}, **{ 'owner': owner, 'pushed_at': util.to_timestamp(pushed_date), 'repo_name': repo_name, 'search_keyword': search_keyword, 'vundle_repos': map(stringify_repo, vundle_repos), 'neobundle_repos': map(stringify_repo, neobundle_repos), 'vimplug_repos': map(stringify_repo, vimplug_repos), 'pathogen_repos': map(stringify_repo, pathogen_repos), }) DotfilesGithubRepos.log_scrape(repo) DotfilesGithubRepos.upsert_with_owner_repo(repo) print 'found %s Vundles, %s NeoBundles, %s VimPlugs, %s Pathogens' % ( len(vundle_repos), len(neobundle_repos), len(vimplug_repos), len(pathogen_repos)) return { 'vundle_repos_count': len(vundle_repos), 'neobundle_repos_count': len(neobundle_repos), 'vimplug_repos_count': len(vimplug_repos), 'pathogen_repos_count': len(pathogen_repos), }
def get_plugin_data(owner, repo_name, repo_data, readme_data=None):
    """Build a plugins-table row from a GitHub repository.

    This should not be used to fetch info from the vim-scripts user's repos
    (an assertion enforces this).

    Arguments:
        owner: The repo's owner's login, eg. "gmarik"
        repo_name: The repo name, eg. "vundle"
        repo_data: GitHub API /repos response for this repo
        readme_data: (optional) GitHub API /readme response for this repo;
            fetched from the API when not supplied.

    Returns:
        A dict of properties that can be inserted as a row in the plugins
        table.
    """
    assert owner != 'vim-scripts'

    repo_path = 'repos/%s/%s' % (owner, repo_name)

    if not readme_data:
        _, readme_data = get_api_page(repo_path + '/readme')

    # The README body arrives base64-encoded; bytes that are not valid UTF-8
    # are dropped rather than raising.
    readme_bytes = base64.b64decode(readme_data.get('content', ''))
    readme = readme_bytes.decode('utf-8', 'ignore')
    readme_filename = readme_data.get('name', '')

    # TODO(david): We used to extract the vim.org ID from the homepage if it
    #     were a vim.org URL, but that became too unreliable as many different
    #     repos would all claim to have the same vim.org homepage, when
    #     sometimes those repos were of different plugins. But it's still
    #     useful information in heuristic matching, just can't be used as
    #     a key.
    homepage = repo_data['homepage']

    repo_created = dateutil.parser.parse(repo_data['created_at'])

    # Fetch commits so we can get the update/create dates.
    _, commits = get_api_page(repo_path + '/commits', per_page=100)

    if not (isinstance(commits, list) and commits):
        # No usable commit list: fall back to the repo-level timestamps.
        updated = dateutil.parser.parse(repo_data['updated_at'])
        created = repo_created
    else:
        # Unfortunately repo_data['updated_at'] and repo_data['pushed_at'] are
        # wildly misrepresentative of the last time someone made a commit to
        # the repo, so the newest fetched commit's author date is used.
        newest_commit_date = commits[0]['commit']['author']['date']
        updated = dateutil.parser.parse(newest_commit_date)

        # To get the creation date, we use the heuristic of min(repo creation
        # date, 100th latest commit date). We do this because repo creation
        # date can be later than the date of the first commit, which is
        # particularly pervasive for vim-scripts repos. Fortunately, most
        # vim-scripts repos don't have more than 100 commits, and also we get
        # creation_date for vim-scripts repos when scraping vim.org.
        oldest_commit_date = commits[-1]['commit']['author']['date']
        created = min(repo_created, dateutil.parser.parse(oldest_commit_date))

    # Fetch owner info to get author name, preferring the display name over
    # the login.
    _, owner_data = get_api_page('users/%s' % repo_data['owner']['login'])
    author = owner_data.get('name') or owner_data.get('login')

    return {
        'created_at': util.to_timestamp(created),
        'updated_at': util.to_timestamp(updated),
        'vimorg_id': None,
        'github_repo_id': str(repo_data['id']),
        'github_owner': owner,
        'github_repo_name': repo_name,
        'github_author': author,
        'github_stars': repo_data['watchers'],
        'github_homepage': homepage,
        'github_short_desc': repo_data['description'],
        'github_readme': readme,
        'github_readme_filename': readme_filename,
    }
def get_plugin_data(owner, repo_name, repo_data, readme_data=None):
    """Build a plugins-table row from a GitHub repository.

    This should not be used to fetch info from the vim-scripts user's repos
    (an assertion enforces this).

    Arguments:
        owner: The repo's owner's login, eg. "gmarik"
        repo_name: The repo name, eg. "vundle"
        repo_data: GitHub API /repos response for this repo
        readme_data: (optional) GitHub API /readme response for this repo;
            fetched from the API when not supplied.

    Returns:
        A dict of properties that can be inserted as a row in the plugins
        table.
    """
    assert owner != 'vim-scripts'

    repo_path = 'repos/%s/%s' % (owner, repo_name)

    if not readme_data:
        _, readme_data = get_api_page(repo_path + '/readme')

    # The README body arrives base64-encoded; bytes that are not valid UTF-8
    # are dropped rather than raising.
    readme_bytes = base64.b64decode(readme_data.get('content', ''))
    readme = readme_bytes.decode('utf-8', 'ignore')
    readme_filename = readme_data.get('name', '')

    # TODO(david): We used to extract the vim.org ID from the homepage if it
    #     were a vim.org URL, but that became too unreliable as many different
    #     repos would all claim to have the same vim.org homepage, when
    #     sometimes those repos were of different plugins. But it's still
    #     useful information in heuristic matching, just can't be used as
    #     a key.
    homepage = repo_data['homepage']

    repo_created = dateutil.parser.parse(repo_data['created_at'])

    # Fetch commits so we can get the update/create dates.
    _, commits = get_api_page(repo_path + '/commits', per_page=100)

    if not (isinstance(commits, list) and commits):
        # No usable commit list: fall back to the repo-level timestamps.
        updated = dateutil.parser.parse(repo_data['updated_at'])
        created = repo_created
    else:
        # Unfortunately repo_data['updated_at'] and repo_data['pushed_at'] are
        # wildly misrepresentative of the last time someone made a commit to
        # the repo, so the newest fetched commit's author date is used.
        newest_commit_date = commits[0]['commit']['author']['date']
        updated = dateutil.parser.parse(newest_commit_date)

        # To get the creation date, we use the heuristic of min(repo creation
        # date, 100th latest commit date). We do this because repo creation
        # date can be later than the date of the first commit, which is
        # particularly pervasive for vim-scripts repos. Fortunately, most
        # vim-scripts repos don't have more than 100 commits, and also we get
        # creation_date for vim-scripts repos when scraping vim.org.
        oldest_commit_date = commits[-1]['commit']['author']['date']
        created = min(repo_created, dateutil.parser.parse(oldest_commit_date))

    # Fetch owner info to get author name, preferring the display name over
    # the login.
    _, owner_data = get_api_page('users/%s' % repo_data['owner']['login'])
    author = owner_data.get('name') or owner_data.get('login')

    return {
        'created_at': util.to_timestamp(created),
        'updated_at': util.to_timestamp(updated),
        'vimorg_id': None,
        'github_repo_id': str(repo_data['id']),
        'github_owner': owner,
        'github_repo_name': repo_name,
        'github_author': author,
        'github_stars': repo_data['watchers'],
        'github_homepage': homepage,
        'github_short_desc': repo_data['description'],
        'github_readme': readme,
        'github_readme_filename': readme_filename,
    }