def _language_id_from_classifier(classifier: str) -> str:
    """Reduce a classifier string to a short language id.

    For example ``'Programming Language :: Python :: 3.6'`` becomes
    ``'Python 3.6'``. A classifier with no ``':'`` separator is returned
    unchanged.
    """
    cleaned = classifier.replace('Implementation ::', '').replace('::', ':')
    parts = cleaned.split(':')
    if len(parts) <= 1:
        return cleaned
    # Joining the last two segments leaves runs of spaces behind; split/join
    # collapses every run to a single space and trims both ends.
    return ' '.join(' '.join(parts[-2:]).split())


def do_import_languages(file_data: List[dict]):
    """Import the distinct programming languages named in package classifiers.

    Every classifier containing ``'Programming Language'`` is reduced to a
    short id; the first time an id is seen, a ``ProgrammingLanguage`` row is
    stored with that id and the unmodified classifier as its description.
    Progress is reported once per package.

    Args:
        file_data: Package dicts. Each may carry an ``'info'`` dict holding a
            ``'classifiers'`` list of strings; missing or ``None`` values are
            treated as "no classifiers".
    """
    imported = set()
    print("Importing languages ... ", flush=True)

    # A single session serves the whole import and is always released,
    # rather than opening (and never closing) one session per language.
    session: Session = DbSession.factory()
    try:
        with progressbar.ProgressBar(max_value=len(file_data)) as bar:
            for idx, p in enumerate(file_data):
                info = p.get('info') or {}
                for classifier in info.get('classifiers') or []:
                    if 'Programming Language' not in classifier:
                        continue

                    text = _language_id_from_classifier(classifier)
                    if text in imported:
                        continue
                    imported.add(text)

                    lang = ProgrammingLanguage()
                    lang.description = classifier
                    lang.id = text
                    session.add(lang)
                    # Commit per language so earlier rows survive a later failure.
                    session.commit()

                bar.update(idx)
    finally:
        session.close()

    sys.stderr.flush()
    sys.stdout.flush()
def _prompt_for_release(label: str, size: int) -> Release:
    """Print *label*, read a three-part version from stdin and build a Release."""
    release = Release()
    print(label)
    release.major_ver = int(input('Major version:'))
    release.minor_ver = int(input('Minor version:'))
    release.build_ver = int(input('Build version:'))
    release.size = size
    return release


def insert_a_package():
    """Interactively create a package with two releases and save it.

    Prompts on stdin for the package fields and for the version numbers of
    two releases (with fixed sizes of 100,000 and 200,000), then stores the
    package together with both releases.

    Raises:
        ValueError: If a version component typed by the user is not an integer.
    """
    p = Package()
    p.id = input("Package name: ")
    p.summary = input("Package summary: ")
    p.author_name = input("Author name: ")
    p.author_email = input("Author email: ")
    p.license = input("license: ")

    r1 = _prompt_for_release("Release 1: ", 100_000)
    r2 = _prompt_for_release("Release 2: ", 200_000)
    p.releases.append(r1)
    p.releases.append(r2)

    session = DbSession.factory()
    try:
        session.add(p)
        session.commit()
    finally:
        # Release the session even when the commit fails.
        session.close()
def latest_releases(limit=10) -> List[Package]:
    """Return the packages behind the most recently created releases.

    Up to ``limit * 2`` releases are fetched, newest first, and each is mapped
    to its package. The result holds one entry per release, so a package with
    several recent releases appears more than once.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        At most ``limit`` packages, ordered by release creation date,
        newest first.

    Raises:
        KeyError: If a release refers to a package id that no longer exists.
    """
    session: Session = DbSession.factory()
    try:
        release_query = session.query(Release) \
            .order_by(Release.created_date.desc()) \
            .limit(limit * 2)

        # Run the release query exactly once; the id list is all that is
        # needed afterwards, so nothing is re-queried after the session closes.
        packages_in_order = [r.package_id for r in release_query]
        if not packages_in_order:
            # Avoid issuing an empty IN () query.
            return []

        package_ids = set(packages_in_order)
        packages = {
            p.id: p
            for p in session.query(Package).filter(Package.id.in_(package_ids))
        }
    finally:
        session.close()

    # max() keeps a negative limit from turning into an "all but the last" slice.
    return [packages[package_id]
            for package_id in packages_in_order[:max(limit, 0)]]
def do_summary():
    """Print the row count of each main table to stdout."""
    session = DbSession.factory()
    try:
        print("Final numbers:")
        print("Users: {:,}".format(session.query(User).count()))
        print("Packages: {:,}".format(session.query(Package).count()))
        print("Releases: {:,}".format(session.query(Release).count()))
        print("Maintainers: {:,}".format(session.query(Maintainer).count()))
        print("Languages: {:,}".format(session.query(ProgrammingLanguage).count()))
        print("Licenses: {:,}".format(session.query(License).count()))
    finally:
        # The session was previously left open after reporting.
        session.close()
def maintainers_for_packages(package_name: str) -> List[User]:
    """Return the users registered as maintainers of a package.

    Args:
        package_name: Id of the package whose maintainers are wanted.

    Returns:
        The ``User`` rows whose ids appear in the package's ``Maintainer``
        rows; an empty list when the package has no maintainers.
    """
    session: Session = DbSession.factory()
    try:
        user_ids = [
            r.user_id
            for r in session.query(Maintainer).filter(
                Maintainer.package_id == package_name)
        ]
        if not user_ids:
            # Nothing to look up; skip the empty IN () query.
            return []

        return list(session.query(User).filter(User.id.in_(user_ids)))
    finally:
        session.close()
def load_package(data: dict, user_lookup: Dict[str, User]):
    """Build a package with its releases and maintainers from *data* and save it.

    The package id comes from ``data['package_name']``; records without a
    non-blank name are skipped. Descriptive fields are copied from
    ``data['info']``, releases are built by ``build_releases`` and a
    ``Maintainer`` row is created for every maintainer e-mail that is present
    in *user_lookup*.

    Args:
        data: Raw package record with ``'package_name'``, ``'info'`` and
            ``'releases'`` entries.
        user_lookup: Already imported users, keyed by e-mail address.

    Returns:
        None. A record that raises ``OverflowError`` is silently dropped.
    """
    package_id = (data.get('package_name') or '').strip()
    if not package_id:
        # Nothing to key the package on; skip before doing any other work.
        return

    info = data.get('info') or {}

    try:
        p = Package()
        p.id = package_id
        p.summary = info.get('summary')
        p.description = info.get('description')
        p.home_page = info.get('home_page')
        p.docs_url = info.get('docs_url')
        p.package_url = info.get('package_url')
        p.author = info.get('author')
        p.author_email = info.get('author_email')
        p.license = detect_license(info.get('license'))

        releases = build_releases(p.id, data.get("releases", {}))
        if releases:
            p.created_date = releases[0].created_date

        maintainers_lookup = get_email_and_name_from_text(
            info.get('maintainer'), info.get('maintainer_email'))
        maintainers = []
        for email, _name in maintainers_lookup.items():
            user = user_lookup.get(email)
            if not user:
                # Maintainer is not a known user; no link can be stored.
                continue

            m = Maintainer()
            m.package_id = p.id
            m.user_id = user.id
            maintainers.append(m)

        session = DbSession.factory()
        try:
            session.add(p)
            session.add_all(releases)
            if maintainers:
                session.add_all(maintainers)
            session.commit()
        finally:
            # Always release the session, including when the commit fails.
            session.close()
    except OverflowError:
        # Some records carry fake data (e.g. sizes in the terabytes) that
        # overflows; such packages are deliberately skipped.
        pass
def create_user(email: str, name: str, password: str) -> User:
    """Create and store a new user.

    Args:
        email: E-mail address; stored lower-cased and stripped of
            surrounding whitespace.
        name: Display name of the user.
        password: Plain-text password; only its hash (via ``hash_text``)
            is stored.

    Returns:
        The newly created ``User``.
    """
    user = User()
    user.name = name
    user.email = email.lower().strip()
    user.hashed_password = hash_text(password)

    session: Session = DbSession.factory()
    try:
        session.add(user)
        session.commit()
    finally:
        # Release the session even when the commit fails.
        session.close()

    return user
def do_user_import(user_lookup: Dict[str, str]) -> Dict[str, User]:
    """Store one user per entry of *user_lookup* and return all stored users.

    Args:
        user_lookup: Mapping of e-mail address to user name.

    Returns:
        Every ``User`` row in the database, keyed by e-mail. The session that
        loaded them is closed before returning.
    """
    print("Importing users ... ", flush=True)

    # One session for the whole import, always released, instead of a new
    # (never closed) session per user.
    session: Session = DbSession.factory()
    session.expire_on_commit = False
    try:
        with progressbar.ProgressBar(max_value=len(user_lookup)) as bar:
            for idx, (email, name) in enumerate(user_lookup.items()):
                user = User()
                user.email = email
                user.name = name
                session.add(user)
                # Commit per user so earlier rows survive a later failure.
                session.commit()
                bar.update(idx)
    finally:
        session.close()

    print()
    sys.stderr.flush()
    sys.stdout.flush()

    session = DbSession.factory()
    try:
        return {u.email: u for u in session.query(User)}
    finally:
        session.close()
def main():
    """Initialize the database, import the data if it is empty, print a summary.

    The import (files, users, packages, languages, licenses) only runs when
    the user table contains no rows.
    """
    init_db()

    session = DbSession.factory()
    try:
        existing_users = session.query(User).count()
    finally:
        # Release the session even if the count query fails.
        session.close()

    if existing_users == 0:
        file_data = do_load_files()
        users = find_users(file_data)

        db_users = do_user_import(users)
        do_import_packages(file_data, db_users)

        do_import_languages(file_data)
        do_import_licenses(file_data)

    do_summary()
def find_package_by_name(package_name: str) -> Optional[Package]:
    """Look up a package by its id, loading its releases with it.

    Args:
        package_name: Id of the package to find.

    Returns:
        The matching ``Package``, or ``None`` when there is no such package.
    """
    db: Session = DbSession.factory()
    try:
        lookup = db.query(Package)
        lookup = lookup.filter(Package.id == package_name)
        lookup = lookup.options(subqueryload(Package.releases))
        found = lookup.first()

        if found:
            # Read the relationship before the session is closed below
            # (presumably so the releases stay usable afterwards).
            # noinspection PyUnusedLocal
            _releases = found.releases

        return found
    finally:
        db.close()
def login_user(email: str, password: str) -> Optional[User]:
    """Return the user matching the given credentials, if any.

    Args:
        email: E-mail address; compared lower-cased and stripped of
            surrounding whitespace.
        password: Plain-text password checked against the stored hash.

    Returns:
        The ``User`` when the e-mail exists and the password verifies;
        ``None`` for an empty e-mail, an unknown e-mail or a wrong password.
    """
    if not email:
        return None

    email = email.lower().strip()

    session: Session = DbSession.factory()
    try:
        user = session.query(User).filter(User.email == email).first()
    finally:
        # Release the session even if the lookup fails.
        session.close()

    if not user:
        return None

    if not verify_hash(user.hashed_password, password):
        return None

    return user
def do_import_licenses(file_data: List[dict]):
    """Import the distinct licenses named by the packages in *file_data*.

    Each package's ``info['license']`` text is passed through
    ``detect_license``; the first time a non-empty result is seen, a
    ``License`` row is stored with that result as its id and the raw text as
    its description. Progress is reported once per package.

    Args:
        file_data: Package dicts. A missing or ``None`` ``'info'`` entry is
            treated as an empty dict.
    """
    imported = set()
    print("Importing licenses ... ", flush=True)

    # A single session serves the whole import and is always released,
    # rather than opening (and never closing) one session per license.
    session: Session = DbSession.factory()
    try:
        with progressbar.ProgressBar(max_value=len(file_data)) as bar:
            for idx, p in enumerate(file_data):
                info = p.get('info') or {}
                raw_license = info.get('license')

                license_text = detect_license(raw_license)
                if license_text and license_text not in imported:
                    imported.add(license_text)

                    package_license = License()
                    package_license.id = license_text
                    package_license.description = raw_license
                    session.add(package_license)
                    # Commit per license so earlier rows survive a later failure.
                    session.commit()

                bar.update(idx)
    finally:
        session.close()

    sys.stderr.flush()
    sys.stdout.flush()
def init_db():
    """Initialize ``DbSession`` with ``db/pypi.sqlite`` next to ``pypi_vm.app``."""
    app_dir = os.path.dirname(pypi_vm.app.__file__)
    DbSession.global_init(os.path.join(app_dir, 'db', 'pypi.sqlite'))
def find_user_by_id(user_id: int) -> Optional[User]:
    """Return the user with the given id, or ``None`` when there is none."""
    db: Session = DbSession.factory()
    try:
        matches = db.query(User).filter(User.id == user_id)
        found = matches.first()
    finally:
        db.close()
    return found
def find_user_by_email(email: str) -> Optional[User]:
    """Return the user whose e-mail equals *email*, or ``None`` when there is none."""
    db: Session = DbSession.factory()
    try:
        matches = db.query(User).filter(User.email == email)
        found = matches.first()
    finally:
        db.close()
    return found
def user_count() -> int:
    """Return the number of ``User`` rows."""
    db: Session = DbSession.factory()
    try:
        total = db.query(User).count()
    finally:
        db.close()
    return total
def package_count() -> int:
    """Return the number of ``Package`` rows."""
    db: Session = DbSession.factory()
    try:
        total = db.query(Package).count()
    finally:
        db.close()
    return total
def init_db():
    """Initialize ``DbSession`` with the absolute path of ``db/pypi.sqlite`` beside this file."""
    here = os.path.dirname(__file__)
    sqlite_path = os.path.join(here, 'db', 'pypi.sqlite')
    DbSession.global_init(os.path.abspath(sqlite_path))
def all_packages(limit: int) -> List[Package]:
    """Return at most *limit* packages as a list."""
    db: Session = DbSession.factory()
    try:
        capped = db.query(Package).limit(limit)
        found = capped.all()
    finally:
        db.close()
    return found
def release_count() -> int:
    """Return the number of ``Release`` rows."""
    db: Session = DbSession.factory()
    try:
        total = db.query(Release).count()
    finally:
        db.close()
    return total