Example #1
0
def get_plugin_info(vimorg_id):
    """Scrape the detail page of a single vim.org script.

    Fetches the script page for ``vimorg_id`` and extracts details that are
    not available from the search page: how many people rated the plugin,
    the author's name, the long description, the install details and the
    dates of the first and last listed releases.

    The page layout is verified with asserts on the label cells, so an
    unexpected layout surfaces as an AssertionError (or as an IndexError /
    AttributeError when an expected node or the rating text is missing).

    Arguments:
        vimorg_id: The vim.org script ID, interpolated into the page URL.

    Returns:
        A dict with the keys "vimorg_num_raters", "vimorg_author",
        "vimorg_long_desc", "vimorg_install_details", "updated_at" and
        "created_at".
    """
    page_url = ('http://www.vim.org/scripts/script.php?script_id=%s' %
                vimorg_id)
    response = requests.get(page_url, timeout=10)
    document = lxml.html.html5parser.document_fromstring(response.text,
                                                         parser=PARSER)

    # The bold text of the "Rating" cell contains "<n>/<m>"; <m> is what gets
    # reported as the number of raters.
    rating_text = document.xpath('//td[contains(text(),"Rating")]/b')[0].text
    num_raters = int(re.search("(\d+)/(\d+)", rating_text).group(2))

    # Rows of the table holding the "prompt" label cells. Each label row is
    # followed by the row carrying its value.
    info_rows = document.xpath(
        '//table[tbody/tr/td[contains(@class,"prompt")]]/*/*')

    assert info_rows[0][0].text == "created by"
    author = info_rows[1][0][0].text

    assert info_rows[6][0].text == "description"
    long_desc_cell = info_rows[7][0]

    assert info_rows[9][0].text == "install details"
    install_cell = info_rows[10][0]

    # Rows of the release table: a header row followed by one row per
    # release. The first data row is used as the update date and the last
    # one as the creation date (presumably the table is newest-first).
    release_rows = document.xpath(
        '//table[tbody/tr/th[text()="release notes"]]/*/*')

    assert release_rows[0][2].text == "date"
    newest_release_text = release_rows[1][2][0].text
    oldest_release_text = release_rows[-1][2][0].text

    def parse_day(day_text):
        return datetime.datetime.strptime(day_text, "%Y-%m-%d")

    updated_date = parse_day(newest_release_text)
    created_date = parse_day(oldest_release_text)

    long_desc = _get_inner_text(long_desc_cell)
    install_details = _get_inner_text(install_cell)
    updated_at = util.to_timestamp(updated_date)
    created_at = util.to_timestamp(created_date)

    return {
        "vimorg_num_raters": num_raters,
        "vimorg_author": author,
        "vimorg_long_desc": long_desc,
        "vimorg_install_details": install_details,
        "updated_at": updated_at,
        "created_at": created_at,
    }
Example #2
0
def get_plugin_info(vimorg_id):
    """Scrape the detail page of a single vim.org script.

    Fetches the script page for ``vimorg_id`` and extracts details that are
    not available from the search page: how many people rated the plugin,
    the author's name, the long description, the install details and the
    dates of the first and last listed releases.

    The page layout is verified with asserts on the label cells, so an
    unexpected layout surfaces as an AssertionError (or as an IndexError /
    AttributeError when an expected node or the rating text is missing).

    Arguments:
        vimorg_id: The vim.org script ID, interpolated into the page URL.

    Returns:
        A dict with the keys "vimorg_num_raters", "vimorg_author",
        "vimorg_long_desc", "vimorg_install_details", "updated_at" and
        "created_at".
    """
    page_url = ('http://www.vim.org/scripts/script.php?script_id=%s' %
                vimorg_id)
    response = requests.get(page_url, timeout=10)
    document = lxml.html.html5parser.document_fromstring(response.text,
                                                         parser=PARSER)

    # The bold text of the "Rating" cell contains "<n>/<m>"; <m> is what gets
    # reported as the number of raters.
    rating_text = document.xpath('//td[contains(text(),"Rating")]/b')[0].text
    num_raters = int(re.search("(\d+)/(\d+)", rating_text).group(2))

    # Rows of the table holding the "prompt" label cells. Each label row is
    # followed by the row carrying its value.
    info_rows = document.xpath(
        '//table[tbody/tr/td[contains(@class,"prompt")]]/*/*')

    assert info_rows[0][0].text == "created by"
    author = info_rows[1][0][0].text

    assert info_rows[6][0].text == "description"
    long_desc_cell = info_rows[7][0]

    assert info_rows[9][0].text == "install details"
    install_cell = info_rows[10][0]

    # Rows of the release table: a header row followed by one row per
    # release. The first data row is used as the update date and the last
    # one as the creation date (presumably the table is newest-first).
    release_rows = document.xpath(
        '//table[tbody/tr/th[text()="release notes"]]/*/*')

    assert release_rows[0][2].text == "date"
    newest_release_text = release_rows[1][2][0].text
    oldest_release_text = release_rows[-1][2][0].text

    def parse_day(day_text):
        return datetime.datetime.strptime(day_text, "%Y-%m-%d")

    updated_date = parse_day(newest_release_text)
    created_date = parse_day(oldest_release_text)

    long_desc = _get_inner_text(long_desc_cell)
    install_details = _get_inner_text(install_cell)
    updated_at = util.to_timestamp(updated_date)
    created_at = util.to_timestamp(created_date)

    return {
        "vimorg_num_raters": num_raters,
        "vimorg_author": author,
        "vimorg_long_desc": long_desc,
        "vimorg_install_details": install_details,
        "updated_at": updated_at,
        "created_at": created_at,
    }
Example #3
0
def _get_plugin_repos_from_dotfiles(repo_data, search_keyword):
    """Search for references to vim plugin repos from a dotfiles repository,
    and insert them into DB.

    Arguments:
        repo_data: API response from GitHub of a repository.
        search_keyword: The keyword used that found this repo.
    """
    owner_repo = repo_data['full_name']

    # Print w/o newline.
    print "    scraping %s ..." % owner_repo,
    sys.stdout.flush()

    res, contents_data = get_api_page('repos/%s/contents' % owner_repo)

    if res.status_code == 404 or not isinstance(contents_data, list):
        print "contents not found"
        return

    repos_by_manager = _extract_bundle_repos_from_dir(contents_data)
    vundle_repos = repos_by_manager.vundle
    neobundle_repos = repos_by_manager.neobundle
    vimplug_repos = repos_by_manager.vimplug

    pathogen_repos = _extract_pathogen_repos(contents_data)

    owner, repo_name = owner_repo.split('/')
    db_repo = DotfilesGithubRepos.get_with_owner_repo(owner, repo_name)
    pushed_date = dateutil.parser.parse(repo_data['pushed_at'])

    def stringify_repo(owner_repo_tuple):
        return '/'.join(owner_repo_tuple)

    repo = dict(
        db_repo or {}, **{
            'owner': owner,
            'pushed_at': util.to_timestamp(pushed_date),
            'repo_name': repo_name,
            'search_keyword': search_keyword,
            'vundle_repos': map(stringify_repo, vundle_repos),
            'neobundle_repos': map(stringify_repo, neobundle_repos),
            'vimplug_repos': map(stringify_repo, vimplug_repos),
            'pathogen_repos': map(stringify_repo, pathogen_repos),
        })

    DotfilesGithubRepos.log_scrape(repo)
    DotfilesGithubRepos.upsert_with_owner_repo(repo)

    print 'found %s Vundles, %s NeoBundles, %s VimPlugs, %s Pathogens' % (
        len(vundle_repos), len(neobundle_repos), len(vimplug_repos),
        len(pathogen_repos))

    return {
        'vundle_repos_count': len(vundle_repos),
        'neobundle_repos_count': len(neobundle_repos),
        'vimplug_repos_count': len(vimplug_repos),
        'pathogen_repos_count': len(pathogen_repos),
    }
Example #4
0
def _get_plugin_repos_from_dotfiles(repo_data, search_keyword):
    """Search for references to vim plugin repos from a dotfiles repository,
    and insert them into DB.

    Arguments:
        repo_data: API response from GitHub of a repository.
        search_keyword: The keyword used that found this repo.
    """
    owner_repo = repo_data['full_name']

    # Print w/o newline.
    print "    scraping %s ..." % owner_repo,
    sys.stdout.flush()

    res, contents_data = get_api_page('repos/%s/contents' % owner_repo)

    if res.status_code == 404 or not isinstance(contents_data, list):
        print "contents not found"
        return

    repos_by_manager = _extract_bundle_repos_from_dir(contents_data)
    vundle_repos = repos_by_manager.vundle
    neobundle_repos = repos_by_manager.neobundle
    vimplug_repos = repos_by_manager.vimplug

    pathogen_repos = _extract_pathogen_repos(contents_data)

    owner, repo_name = owner_repo.split('/')
    db_repo = DotfilesGithubRepos.get_with_owner_repo(owner, repo_name)
    pushed_date = dateutil.parser.parse(repo_data['pushed_at'])

    def stringify_repo(owner_repo_tuple):
        return '/'.join(owner_repo_tuple)

    repo = dict(db_repo or {}, **{
        'owner': owner,
        'pushed_at': util.to_timestamp(pushed_date),
        'repo_name': repo_name,
        'search_keyword': search_keyword,
        'vundle_repos': map(stringify_repo, vundle_repos),
        'neobundle_repos': map(stringify_repo, neobundle_repos),
        'vimplug_repos': map(stringify_repo, vimplug_repos),
        'pathogen_repos': map(stringify_repo, pathogen_repos),
    })

    DotfilesGithubRepos.log_scrape(repo)
    DotfilesGithubRepos.upsert_with_owner_repo(repo)

    print 'found %s Vundles, %s NeoBundles, %s VimPlugs, %s Pathogens' % (
            len(vundle_repos), len(neobundle_repos),
            len(vimplug_repos), len(pathogen_repos))

    return {
        'vundle_repos_count': len(vundle_repos),
        'neobundle_repos_count': len(neobundle_repos),
        'vimplug_repos_count': len(vimplug_repos),
        'pathogen_repos_count': len(pathogen_repos),
    }
Example #5
0
def get_plugin_data(owner, repo_name, repo_data, readme_data=None):
    """Assemble the plugin properties of a GitHub repo.

    This should not be used for repos of the vim-scripts user; an assert
    rejects that owner.

    Arguments:
        owner: The repo's owner's login, eg. "gmarik"
        repo_name: The repo name, eg. "vundle"
        repo_data: GitHub API /repos response for this repo
        readme_data: (optional) GitHub API /readme response for this repo;
            fetched from the API when omitted or empty

    Returns:
        A dict of properties that can be inserted as a row in the plugins table
    """
    assert owner != 'vim-scripts'

    repo_path = 'repos/%s/%s' % (owner, repo_name)

    if not readme_data:
        _, readme_data = get_api_page(repo_path + '/readme')

    readme_bytes = base64.b64decode(readme_data.get('content', ''))
    # Bytes that are not valid UTF-8 are dropped instead of raising.
    readme = unicode(readme_bytes, 'utf-8', errors='ignore')
    readme_filename = readme_data.get('name', '')

    # TODO(david): The vim.org ID used to be extracted from the homepage when
    #     it was a vim.org URL. That proved too unreliable: many different
    #     repos claim the same vim.org homepage while sometimes being
    #     different plugins. The homepage is still useful for heuristic
    #     matching, it just can't serve as a key.
    homepage = repo_data['homepage']

    repo_created_date = dateutil.parser.parse(repo_data['created_at'])

    # The commits are what the update/create dates are derived from.
    _, commits = get_api_page(repo_path + '/commits', per_page=100)

    def author_date(commit):
        return dateutil.parser.parse(commit['commit']['author']['date'])

    if isinstance(commits, list) and commits:
        # repo_data['updated_at'] and repo_data['pushed_at'] misrepresent
        # when the last commit was made, so the first returned commit is
        # used instead.
        updated_date = author_date(commits[0])

        # Creation date heuristic: min(repo creation date, date of the last
        # commit on this page of up to 100). The repo can have been created
        # later than its first commit, and according to the original author
        # most vim-scripts repos have fewer than 100 commits.
        created_date = min(repo_created_date, author_date(commits[-1]))
    else:
        updated_date = dateutil.parser.parse(repo_data['updated_at'])
        created_date = repo_created_date

    # The author name comes from the owner's user record, falling back to
    # the login when no name is set.
    _, owner_data = get_api_page('users/%s' % repo_data['owner']['login'])
    author = owner_data.get('name') or owner_data.get('login')

    return {
        'created_at': util.to_timestamp(created_date),
        'updated_at': util.to_timestamp(updated_date),
        'vimorg_id': None,
        'github_repo_id': str(repo_data['id']),
        'github_owner': owner,
        'github_repo_name': repo_name,
        'github_author': author,
        'github_stars': repo_data['watchers'],
        'github_homepage': homepage,
        'github_short_desc': repo_data['description'],
        'github_readme': readme,
        'github_readme_filename': readme_filename,
    }
Example #6
0
def get_plugin_data(owner, repo_name, repo_data, readme_data=None):
    """Assemble the plugin properties of a GitHub repo.

    This should not be used for repos of the vim-scripts user; an assert
    rejects that owner.

    Arguments:
        owner: The repo's owner's login, eg. "gmarik"
        repo_name: The repo name, eg. "vundle"
        repo_data: GitHub API /repos response for this repo
        readme_data: (optional) GitHub API /readme response for this repo;
            fetched from the API when omitted or empty

    Returns:
        A dict of properties that can be inserted as a row in the plugins table
    """
    assert owner != 'vim-scripts'

    repo_path = 'repos/%s/%s' % (owner, repo_name)

    if not readme_data:
        _, readme_data = get_api_page(repo_path + '/readme')

    readme_bytes = base64.b64decode(readme_data.get('content', ''))
    # Bytes that are not valid UTF-8 are dropped instead of raising.
    readme = unicode(readme_bytes, 'utf-8', errors='ignore')
    readme_filename = readme_data.get('name', '')

    # TODO(david): The vim.org ID used to be extracted from the homepage when
    #     it was a vim.org URL. That proved too unreliable: many different
    #     repos claim the same vim.org homepage while sometimes being
    #     different plugins. The homepage is still useful for heuristic
    #     matching, it just can't serve as a key.
    homepage = repo_data['homepage']

    repo_created_date = dateutil.parser.parse(repo_data['created_at'])

    # The commits are what the update/create dates are derived from.
    _, commits = get_api_page(repo_path + '/commits', per_page=100)

    def author_date(commit):
        return dateutil.parser.parse(commit['commit']['author']['date'])

    if isinstance(commits, list) and commits:
        # repo_data['updated_at'] and repo_data['pushed_at'] misrepresent
        # when the last commit was made, so the first returned commit is
        # used instead.
        updated_date = author_date(commits[0])

        # Creation date heuristic: min(repo creation date, date of the last
        # commit on this page of up to 100). The repo can have been created
        # later than its first commit, and according to the original author
        # most vim-scripts repos have fewer than 100 commits.
        created_date = min(repo_created_date, author_date(commits[-1]))
    else:
        updated_date = dateutil.parser.parse(repo_data['updated_at'])
        created_date = repo_created_date

    # The author name comes from the owner's user record, falling back to
    # the login when no name is set.
    _, owner_data = get_api_page('users/%s' % repo_data['owner']['login'])
    author = owner_data.get('name') or owner_data.get('login')

    return {
        'created_at': util.to_timestamp(created_date),
        'updated_at': util.to_timestamp(updated_date),
        'vimorg_id': None,
        'github_repo_id': str(repo_data['id']),
        'github_owner': owner,
        'github_repo_name': repo_name,
        'github_author': author,
        'github_stars': repo_data['watchers'],
        'github_homepage': homepage,
        'github_short_desc': repo_data['description'],
        'github_readme': readme,
        'github_readme_filename': readme_filename,
    }