Ejemplo n.º 1
0
def main():
    """Compute and store README word and code-block counts for each package.

    Every ``Package`` row whose ``readme`` is not the empty string is rendered
    from Markdown to HTML and parsed with BeautifulSoup.  Two heuristic
    measurements are then written to the package's ``ReadmeAnalysis`` row,
    which is created when missing and updated in place otherwise:

    * ``word_count`` -- number of runs of word characters in the rendered text.
    * ``code_count`` -- number of ``<pre>`` elements in the rendered HTML.
    """
    # Raw string on purpose: '\w' inside a plain string literal is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    # Compiled once here because the pattern does not change between packages.
    word_pattern = re.compile(r'\w+')

    for package in Package.select().where(Package.readme != ''):

        html = markdown.markdown(package.readme)
        soup = BeautifulSoup(html, 'html.parser')

        # This is a heuristic for word count.
        # It will not be precisely correct, depending on your definition of word.
        # For example, a path like 'com.app.example' is split into three words here.
        word_count = len(word_pattern.findall(soup.text))

        # Another heuristic.  As it's typical that inline code examples occur in <pre>
        # blocks, especially in formatted markdown, we count code blocks based
        # on the appearance of <pre> tags.
        block_count = len(soup.find_all('pre'))

        try:
            analysis = ReadmeAnalysis.get(ReadmeAnalysis.package == package)
        except ReadmeAnalysis.DoesNotExist:
            # First time this package is analyzed: insert a fresh row.
            ReadmeAnalysis.create(
                package=package, code_count=block_count, word_count=word_count
            )
            logging.debug("Created README analysis for package %s", package.name)
        else:
            # A row already exists: overwrite its counts with the new values.
            analysis.code_count = block_count
            analysis.word_count = word_count
            analysis.save()
            logging.debug("Updated README analysis for package %s", package.name)
Ejemplo n.º 2
0
def main():
    """Compute and store README word and code-block counts for each package.

    Every ``Package`` row whose ``readme`` is not the empty string is scanned
    as raw text (the README is not rendered first).  Two heuristic
    measurements are then written to the package's ``ReadmeAnalysis`` row,
    which is created when missing and updated in place otherwise:

    * ``word_count`` -- number of runs of word characters in the README.
    * ``code_count`` -- number of reStructuredText ``::`` markers that are
      followed, after the rest of their line, by a blank line.

    An info-level message is logged once all packages have been processed.
    """
    # Raw strings on purpose: '\w' inside a plain string literal is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    # Both patterns are compiled once because they do not change per package.
    word_pattern = re.compile(r'\w+')

    # '::', then anything up to the end of that line, then two newlines.
    code_marker_pattern = re.compile(r'::.*\n\n')

    for package in Package.select().where(Package.readme != ''):

        readme_text = package.readme

        # This is a heuristic for word count.
        # It will not be precisely correct, depending on your definition of word.
        # For example, a path like 'com.app.example' is split into three words here.
        word_count = len(word_pattern.findall(readme_text))

        # Another heuristic.
        # In reStructuredText (reST), code blocks are introduced by ending a paragraph
        # with the special marker ::.  The block must be indented and separated from
        # the surrounding paragraphs by blank lines, so at least two newline
        # characters must follow the line that carries the marker.
        #
        # This may prove to be a broken heuristic. In that case, consider using Sphinx:
        # http://www.sphinx-doc.org/en/stable/index.html.
        block_count = len(code_marker_pattern.findall(readme_text))

        try:
            analysis = ReadmeAnalysis.get(ReadmeAnalysis.package == package)
        except ReadmeAnalysis.DoesNotExist:
            # First time this package is analyzed: insert a fresh row.
            ReadmeAnalysis.create(
                package=package, code_count=block_count, word_count=word_count
            )
            logging.debug("Created README analysis for package %s", package.name)
        else:
            # A row already exists: overwrite its counts with the new values.
            analysis.code_count = block_count
            analysis.word_count = word_count
            analysis.save()
            logging.debug("Updated README analysis for package %s", package.name)

    logging.info("Finished analyzing READMEs.")
Ejemplo n.º 3
0
if __name__ == '__main__':

    # Command-line entry point.  Every option is a plain on/off switch, so the
    # switches are listed as (flag, help text) pairs and registered in a loop.
    parser = argparse.ArgumentParser(description="Download package stats for PyPI")
    for flag, help_text in (
        ('--package-list', "fetch list of all packages on PyPI"),
        ('--pypi-data', "fetch PyPI data (READMES and downloads)"),
        ('--update', "only update existing data (currently only for --pypi-data)"),
    ):
        parser.add_argument(flag, action='store_true', help=help_text)
    args = parser.parse_args()

    # Stage 1: create the tables, then download the package list.
    if args.package_list:
        create_tables()
        fetch_package_list()

    # Stage 2: download PyPI data.  With --update the packages that already
    # have a non-empty description are selected; without it, the packages
    # whose readme is still NULL are selected.
    if args.pypi_data:
        packages = Package.select().where(
            (Package.description != '') if args.update else (Package.readme >> None)
        )
        fetch_pypi_data(packages)
Ejemplo n.º 4
0
        help="how many package names to fetch"
    )
    # On/off switch enabling the GitHub README stage further down.
    parser.add_argument(
        '--github-readmes',
        action='store_true',
        help="fetch Github READMEs"
    )
    # On/off switch enabling the GitHub stats stage further down.
    parser.add_argument(
        '--github-stats',
        action='store_true',
        help="fetch Github stats"
    )
    args = parser.parse_args()

    # NOTE(review): the options behind args.package_list, args.npm_data,
    # args.update, args.lib_packages and args.lib_package_count are declared
    # before this fragment -- confirm their names and types there.
    #
    # The stages below are separate `if` statements (not `elif`), so every
    # stage whose flag was given runs, in the order written here.

    # Create the tables, then fetch the package list.
    if args.package_list:
        create_tables()
        fetch_package_list()
    # Fetch npm data for a selection of packages.
    if args.npm_data:
        if args.update:
            # Update mode: packages whose description is not the empty string.
            packages = Package.select().where(Package.description != '')
        else:
            # Default mode: packages whose readme is NULL (peewee's `>> None`
            # is an IS NULL test), visited in random order.
            packages = Package.select().where(Package.readme >> None).order_by(fn.Random())
        fetch_npm_data(packages)
    # Create the tables, then fetch package names from libraries.io; the count
    # given on the command line is passed straight to the fetcher.
    if args.lib_packages:
        create_tables()
        fetch_packagenames_from_libraryio(args.lib_package_count)
    # The two GitHub stages are handed an unfiltered Package query.
    if args.github_readmes:
        fetch_github_readmes(Package.select())
    if args.github_stats:
        fetch_github_stats(Package.select())