def fetch_npm_package_list():
    """Fetch the full list of npm package names and store each as an NPMPackage."""
    res = make_request(
        default_requests_session.get,
        "https://skimdb.npmjs.com/registry/_all_docs",
    )
    if res is not None:
        packages = res.json()['rows']
        for p in packages:
            NPMPackage.get_or_create(name=p['id'])
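# For reference: skimdb.npmjs.com is a CouchDB mirror of the npm registry, so
# the `_all_docs` view queried above returns a payload of roughly this shape,
# with one row per package and the package name in the 'id' field (which is
# why the loop above reads p['id']):
#
#     {
#         "total_rows": <number of packages>,
#         "offset": 0,
#         "rows": [
#             {"id": "<package name>", "key": "<package name>", "value": {"rev": "..."}},
#             ...
#         ]
#     }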
def npm_analysis():
    """Count words and code blocks in the README of every npm package that has one."""
    logging.info("Starting NPM package analysis.")
    for p in NPMPackage.select().where(NPMPackage.readme != ''):
        readme_text = p.readme
        html = markdown.markdown(readme_text)
        soup = BeautifulSoup(html, 'html.parser')

        # This is a heuristic for word count.  It will not be precisely
        # correct, depending on your definition of a word.  For example, a
        # path like 'com.app.example' is split into three words here.
        word_count = len(re.findall(r'\w+', soup.text))

        # Another heuristic: code examples in rendered markdown typically end
        # up in <pre> blocks, so we count code blocks by counting <pre> tags.
        code_blocks = soup.find_all('pre')
        block_count = len(code_blocks)

        try:
            analysis = NPMReadmeAnalysis.get(NPMReadmeAnalysis.package == p)
        except NPMReadmeAnalysis.DoesNotExist:
            analysis = NPMReadmeAnalysis.create(
                package=p,
                code_count=block_count,
                word_count=word_count,
            )
            logging.debug("Created README analysis for package %s", p.name)
        else:
            analysis.code_count = block_count
            analysis.word_count = word_count
            analysis.save()
            logging.debug("Updated README analysis for package %s", p.name)
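# A small, self-contained sketch: the helper name `readme_counts` is purely
# illustrative and is not used elsewhere in this module.  It applies the same
# two heuristics as npm_analysis() to an arbitrary markdown string, which is
# handy for spot-checking the counts without touching the database.  For
# instance, a README of "# Demo\n\nInstall it:\n\n    npm install demo\n"
# should come out as 6 words and 1 code block.
def readme_counts(readme_text):
    """Return (word_count, code_block_count) for a markdown README string."""
    soup = BeautifulSoup(markdown.markdown(readme_text), 'html.parser')
    word_count = len(re.findall(r'\w+', soup.text))
    code_block_count = len(soup.find_all('pre'))
    return word_count, code_block_count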
help="how many package names to fetch (only applicable to npm)") parser.add_argument('--github-readmes', action='store_true', help="fetch Github READMEs (only applicable to npm)") parser.add_argument('--github-stats', action='store_true', help="fetch Github stats (only applicable to npm)") args = parser.parse_args() if args.db == 'npm': if args.package_list: create_npm_tables() fetch_npm_package_list() if args.data: if args.update: packages = NPMPackage.select().where( NPMPackage.description != '') else: packages = NPMPackage.select().where( NPMPackage.readme >> None).order_by(fn.Random()) fetch_npm_data(packages) if args.lib_packages: create_tables() fetch_packagenames_from_libraryio(args.lib_package_count) if args.github_readmes: fetch_github_readmes(NPMPackage.select()) if args.github_stats: fetch_github_stats(NPMPackage.select()) elif args.db == 'pypi': if args.package_list: create_pypi_tables() fetch_pypi_package_list()
) parser.add_argument( '--github-stats', action='store_true', help="fetch Github stats (only applicable to npm)" ) args = parser.parse_args() if args.db == 'npm': if args.package_list: create_npm_tables() fetch_npm_package_list() if args.data: if args.update: packages = NPMPackage.select().where(NPMPackage.description != '') else: packages = NPMPackage.select().where(NPMPackage.readme >> None).order_by(fn.Random()) fetch_npm_data(packages) if args.lib_packages: create_tables() fetch_packagenames_from_libraryio(args.lib_package_count) if args.github_readmes: fetch_github_readmes(NPMPackage.select()) if args.github_stats: fetch_github_stats(NPMPackage.select()) elif args.db == 'pypi': if args.package_list: create_pypi_tables() fetch_pypi_package_list() if args.data:
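# Illustrative invocations.  The script name and the exact spellings of the
# `db` argument and the flags are inferred from the args.* attributes above
# rather than taken from the original argument definitions, so treat them as
# assumptions:
#
#     python fetch.py npm --package-list     # create tables and seed npm package names
#     python fetch.py npm --data             # fetch data for packages with no README yet
#     python fetch.py npm --data --update    # re-fetch packages that already have a description
#     python fetch.py npm --github-stats     # fetch Github stats for all npm packages
#     python fetch.py pypi --package-list    # create tables and seed PyPI package names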