def init_db(_):
    """Initialise DbSession with the ``db/pypi.sqlite`` file beside this module.

    The single positional argument is accepted but never used.
    """
    module_dir = os.path.dirname(__file__)
    sqlite_path = os.path.join(module_dir, 'db', 'pypi.sqlite')
    DbSession.global_init(os.path.abspath(sqlite_path))
def do_import_languages(file_data: List[dict]):
    """Store one ProgrammingLanguage row per distinct language classifier.

    Every entry of each package's ``info['classifiers']`` list that contains
    ``'Programming Language'`` is reduced to a short id built from its last
    two ``::``-separated parts. The first time an id is seen, a
    ``ProgrammingLanguage`` row is committed with that id and the untouched
    classifier string as its description.

    Args:
        file_data: Package metadata dicts. Packages without an ``'info'``
            dict or without classifiers are skipped.
    """
    imported = set()
    print("Importing languages ... ", flush=True)

    # The length has to be passed as max_value (as the other import functions
    # in this file do); passed positionally it lands in progressbar2's first
    # parameter, min_value, and the bar never learns its length.
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            info = p.get('info') or {}
            for classifier in info.get('classifiers') or []:
                if 'Programming Language' not in classifier:
                    continue

                cleaned = classifier.replace('Implementation ::', '')
                cleaned = cleaned.replace('::', ':')
                text = cleaned
                parts = cleaned.split(':')
                if len(parts) > 1:
                    # NOTE(review): replace(' ', ' ') is a no-op as written;
                    # it was presumably meant to collapse doubled spaces.
                    # Left unchanged so stored ids stay the same - confirm.
                    text = ' '.join(parts[-2:]).strip().replace(' ', ' ')

                if text in imported:
                    continue
                imported.add(text)

                session: Session = DbSession.create()
                try:
                    lang = ProgrammingLanguage()
                    lang.description = classifier
                    lang.id = text
                    session.add(lang)
                    session.commit()
                finally:
                    # Release the session even when the commit fails.
                    session.close()

            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
def do_summary():
    """Print the row count of each imported table, with thousands separators.

    The session used for the counts is closed before returning, including
    when one of the queries raises.
    """
    session = DbSession.create()
    try:
        print("Final numbers:")
        tables = (
            ("Users", User),
            ("Packages", Package),
            ("Releases", Release),
            ("Maintainers", Maintainer),
            ("Languages", ProgrammingLanguage),
            ("Licenses", License),
        )
        for label, model in tables:
            print(f"{label}: {session.query(model).count():,}")
    finally:
        session.close()
def do_summary():
    """Print how many rows each imported table holds.

    Counts are formatted with thousands separators and printed in a fixed
    order after a "Final numbers:" header.
    """
    session = DbSession.factory()
    print("Final numbers:")

    report_lines = (
        ("Users: {:,}", User),
        ("Packages: {:,}", Package),
        ("Releases: {:,}", Release),
        ("Maintainers: {:,}", Maintainer),
        ("Languages: {:,}", ProgrammingLanguage),
        ("Licenses: {:,}", License),
    )
    for template, model in report_lines:
        row_count = session.query(model).count()
        print(template.format(row_count))
def load_package(data: dict, user_lookup: Dict[str, User]):
    """Build a Package with its releases and maintainers and commit them.

    Args:
        data: One package's metadata. Reads ``'package_name'``, ``'info'``
            and ``'releases'``. Nothing is stored when the package name is
            missing or blank.
        user_lookup: Users keyed by email. Maintainer emails that are not in
            this mapping are skipped.

    Packages whose data raises ``OverflowError`` are silently skipped; any
    other exception propagates to the caller.
    """
    info = data.get('info') or {}
    package_id = (data.get('package_name') or '').strip()
    if not package_id:
        return

    try:
        p = Package()
        p.id = package_id
        p.author = info.get('author')
        p.author_email = info.get('author_email')
        p.summary = info.get('summary')
        p.description = info.get('description')
        p.home_page = info.get('home_page')
        p.docs_url = info.get('docs_url')
        p.package_url = info.get('package_url')
        p.license = detect_license(info.get('license'))

        releases = build_releases(p.id, data.get("releases", {}))
        if releases:
            p.created_date = releases[0].created_date

        maintainers_lookup = get_email_and_name_from_text(
            info.get('maintainer'), info.get('maintainer_email'))
        maintainers = []
        for email, _name in maintainers_lookup.items():
            user = user_lookup.get(email)
            if not user:
                continue
            m = Maintainer()
            m.package_id = p.id
            m.user_id = user.id
            maintainers.append(m)

        session = DbSession.create()
        try:
            session.add(p)
            session.add_all(releases)
            if maintainers:
                session.add_all(maintainers)
            session.commit()
        finally:
            # Always release the session, including when the commit raises
            # the OverflowError that is swallowed below.
            session.close()
    except OverflowError:
        # Deliberate best-effort skip: some packages carry fake data with
        # sizes in the terabytes.
        pass
def do_user_import(user_lookup: Dict[str, str]) -> Dict[str, User]:
    """Insert one User per entry of *user_lookup* and return them by email.

    Args:
        user_lookup: Mapping of email address to user name.

    Returns:
        Every ``User`` row found in the database after the import, keyed by
        its ``email``.
    """
    print("Importing users ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(user_lookup)) as bar:
        for idx, (email, name) in enumerate(user_lookup.items()):
            session: Session = DbSession.create()
            try:
                session.expire_on_commit = False

                user = User()
                user.email = email
                user.name = name
                session.add(user)
                session.commit()
            finally:
                # One session is opened per user; release it before the next.
                session.close()
            bar.update(idx)

    print()
    sys.stderr.flush()
    sys.stdout.flush()

    # This session is not closed here: the returned User objects were loaded
    # through it and are handed back to the caller.
    lookup_session: Session = DbSession.create()
    return {u.email: u for u in lookup_session.query(User)}
def main():
    """Initialise the database, import everything if it is empty, summarise.

    The import steps run only when the User table has no rows; the summary
    is printed either way.
    """
    init_db()

    probe = DbSession.create()
    existing_users = probe.query(User).count()
    probe.close()

    database_is_empty = existing_users == 0
    if database_is_empty:
        packages = do_load_files()
        found_users = find_users(packages)
        stored_users = do_user_import(found_users)
        do_import_packages(packages, stored_users)
        do_import_languages(packages)
        do_import_licenses(packages)

    do_summary()
def do_import_licenses(file_data: List[dict]):
    """Store one License row per distinct detected license.

    For each package, ``detect_license`` is applied to ``info['license']``.
    The first time a non-empty result is seen, a ``License`` row is
    committed with that result as its id and the raw license text as its
    description.

    Args:
        file_data: Package metadata dicts. Packages without an ``'info'``
            dict are treated as having no license text.
    """
    imported = set()
    print("Importing licenses ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            info = p.get('info') or {}
            raw_license = info.get('license')
            license_text = detect_license(raw_license)

            if license_text and license_text not in imported:
                imported.add(license_text)
                session: Session = DbSession.create()
                try:
                    package_license = License()
                    package_license.id = license_text
                    package_license.description = raw_license
                    session.add(package_license)
                    session.commit()
                finally:
                    # Release the session even when the commit fails.
                    session.close()

            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
def init_db(log: logbook.Logger):
    """Initialise DbSession from ``db/pypi.sqlite`` and log a notice.

    Args:
        log: Logger that receives the 'DB initialized.' notice once
            ``DbSession.global_init`` has returned.
    """
    base_dir = os.path.dirname(__file__)
    relative_path = os.path.join(base_dir, 'db', 'pypi.sqlite')
    db_file = os.path.abspath(relative_path)

    DbSession.global_init(db_file)
    log.notice('DB initialized.')
def init_db():
    """Initialise DbSession with ``db/pypi.sqlite`` inside the pypi package.

    The path is resolved relative to the directory holding ``pypi.__file__``.
    """
    package_dir = os.path.dirname(pypi.__file__)
    DbSession.global_init(os.path.join(package_dir, 'db', 'pypi.sqlite'))