Esempio n. 1
0
def init_db(_):
    """Initialize ``DbSession`` with the SQLite file stored beside this module.

    The single positional argument is accepted and ignored.
    """
    module_dir = os.path.dirname(__file__)
    # The database lives in the "db" sub-folder; hand over an absolute path.
    sqlite_path = os.path.abspath(
        os.path.join(module_dir, 'db', 'pypi.sqlite'))
    DbSession.global_init(sqlite_path)
def do_import_languages(file_data: List[dict]):
    """Store one ``ProgrammingLanguage`` row per distinct language classifier.

    Every record's ``info['classifiers']`` list is scanned for entries that
    contain ``'Programming Language'``. Each such classifier is reduced to a
    short key (its last two ``::``-separated parts, with the
    ``'Implementation ::'`` marker removed); the first time a key is seen a
    row is committed with that key as ``id`` and the untouched classifier as
    ``description``.

    Args:
        file_data: Package records. Records with a missing or empty
            ``'info'`` / ``'classifiers'`` entry are skipped.
    """
    imported = set()
    print("Importing languages ... ", flush=True)
    # Pass the total by keyword: in progressbar2 the first positional
    # parameter of ProgressBar is min_value, not max_value.
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            # Tolerate records without metadata instead of crashing on None.
            info = p.get('info') or {}
            classifiers = info.get('classifiers') or []
            for c in classifiers:
                if 'Programming Language' not in c:
                    continue

                original = c

                c = c.replace('Implementation ::', '').replace('::', ':')
                text = c
                parts = c.split(':')
                if len(parts) > 1:
                    # Keep only the two most specific parts and collapse the
                    # double space left behind by the replacements above.
                    text = ' '.join(parts[-2:]).strip().replace('  ', ' ')

                if text in imported:
                    continue
                imported.add(text)

                session: Session = DbSession.create()
                try:
                    lang = ProgrammingLanguage()
                    lang.description = original
                    lang.id = text
                    session.add(lang)
                    session.commit()
                finally:
                    # Release the session even when the commit fails.
                    session.close()

            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
def do_summary():
    """Print the number of rows stored for each imported model.

    Counts are printed with thousands separators, one model per line, under
    a "Final numbers:" heading. The session used for the queries is closed
    before returning, including when a query raises.
    """
    # (label, model) pairs in the order they are reported.
    tables = (
        ("Users", User),
        ("Packages", Package),
        ("Releases", Release),
        ("Maintainers", Maintainer),
        ("Languages", ProgrammingLanguage),
        ("Licenses", License),
    )

    session = DbSession.create()
    try:
        print("Final numbers:")
        for label, model in tables:
            print(f"{label}: {session.query(model).count():,}")
    finally:
        session.close()
Esempio n. 4
0
def do_summary():
    """Print the row count of every imported model under a common heading."""
    db = DbSession.factory()

    # Output template paired with the model whose rows it reports.
    report_lines = (
        ("Users: {:,}", User),
        ("Packages: {:,}", Package),
        ("Releases: {:,}", Release),
        ("Maintainers: {:,}", Maintainer),
        ("Languages: {:,}", ProgrammingLanguage),
        ("Licenses: {:,}", License),
    )

    print("Final numbers:")
    for template, model in report_lines:
        row_count = db.query(model).count()
        print(template.format(row_count))
def load_package(data: dict, user_lookup: Dict[str, User]):
    """Insert one package together with its releases and known maintainers.

    Args:
        data: Raw package record. Reads ``'package_name'``, ``'info'`` and
            ``'releases'``. A record whose package name is blank is skipped.
        user_lookup: Already imported users, keyed by the same keys that
            ``get_email_and_name_from_text`` returns (e-mail addresses).
            Maintainers that are not in this lookup are ignored.

    An ``OverflowError`` raised while building or storing the record is
    deliberately swallowed (the package is simply not imported); every other
    exception propagates to the caller.
    """
    try:
        # "or {}" also covers records that carry an explicit None.
        info = data.get('info') or {}

        p = Package()
        p.id = data.get('package_name', '').strip()
        if not p.id:
            return

        releases = build_releases(p.id, data.get("releases", {}))
        if releases:
            p.created_date = releases[0].created_date

        maintainers_lookup = get_email_and_name_from_text(
            info.get('maintainer'), info.get('maintainer_email'))
        maintainers = []
        for email in maintainers_lookup:
            user = user_lookup.get(email)
            if not user:
                continue

            m = Maintainer()
            m.package_id = p.id
            m.user_id = user.id
            maintainers.append(m)

        p.summary = info.get('summary')
        p.description = info.get('description')

        p.home_page = info.get('home_page')
        p.docs_url = info.get('docs_url')
        p.package_url = info.get('package_url')

        p.author = info.get('author')
        p.author_email = info.get('author_email')
        p.license = detect_license(info.get('license'))

        session = DbSession.create()
        try:
            session.add(p)
            session.add_all(releases)
            session.add_all(maintainers)
            session.commit()
        finally:
            # Always release the session, including when commit() raises
            # the OverflowError that is swallowed below.
            session.close()
    except OverflowError:
        # Some records carry bogus values (e.g. sizes in the terabytes)
        # that cannot be stored; skip those packages on purpose.
        pass
def do_user_import(user_lookup: Dict[str, str]) -> Dict[str, User]:
    """Insert one ``User`` row per entry and return the stored users.

    Args:
        user_lookup: Mapping of e-mail address to display name.

    Returns:
        All ``User`` rows found in the database after the import, keyed by
        their ``email`` attribute.
    """
    print("Importing users ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(user_lookup)) as bar:
        for idx, (email, name) in enumerate(user_lookup.items()):
            session: Session = DbSession.create()
            try:
                session.expire_on_commit = False

                user = User()
                user.email = email
                user.name = name
                session.add(user)

                session.commit()
            finally:
                # Each user gets its own session; release it right away
                # instead of leaving one open session per imported user.
                session.close()
            bar.update(idx)

    print()
    sys.stderr.flush()
    sys.stdout.flush()

    # This session is intentionally left open: the returned objects stay
    # attached to it while the caller keeps using them.
    lookup_session: Session = DbSession.create()
    return {u.email: u for u in lookup_session.query(User)}
def main():
    """Initialize the database, import the data if needed, print a summary.

    The import steps run only when the user table is empty; the summary is
    printed in either case.
    """
    init_db()

    # Probe the user table with a short-lived session.
    probe = DbSession.create()
    existing_users = probe.query(User).count()
    probe.close()

    needs_import = existing_users == 0
    if needs_import:
        records = do_load_files()
        raw_users = find_users(records)

        # Users go in first: the package import needs the stored users.
        stored_users = do_user_import(raw_users)
        do_import_packages(records, stored_users)

        do_import_languages(records)
        do_import_licenses(records)

    do_summary()
def do_import_licenses(file_data: List[dict]):
    """Store one ``License`` row per distinct detected license.

    For every record, ``info['license']`` is passed through
    ``detect_license``. The first time a non-empty result is seen, a row is
    committed with that result as ``id`` and the raw license text of that
    record as ``description``.

    Args:
        file_data: Package records. Records with a missing or empty
            ``'info'`` entry are treated as having no license text.
    """
    imported = set()
    print("Importing licenses ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            # Tolerate records without metadata instead of crashing on None.
            info = p.get('info') or {}
            raw_license = info.get('license')
            license_text = detect_license(raw_license)

            if license_text and license_text not in imported:
                imported.add(license_text)

                session: Session = DbSession.create()
                try:
                    package_license = License()
                    package_license.id = license_text
                    package_license.description = raw_license

                    session.add(package_license)
                    session.commit()
                finally:
                    # Release the session even when the commit fails.
                    session.close()

            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
def init_db(log: logbook.Logger):
    """Initialize ``DbSession`` with the module-local SQLite file and log it.

    Args:
        log: Logger that receives a notice once initialization has returned.
    """
    here = os.path.dirname(__file__)
    sqlite_path = os.path.abspath(os.path.join(here, 'db', 'pypi.sqlite'))

    DbSession.global_init(sqlite_path)

    log.notice('DB initialized.')
def init_db():
    """Initialize ``DbSession`` with the SQLite file inside the ``pypi`` package.

    The path is built from the directory of ``pypi.__file__`` and is passed
    on as-is, without being converted to an absolute path.
    """
    package_dir = os.path.dirname(pypi.__file__)
    sqlite_path = os.path.join(package_dir, 'db', 'pypi.sqlite')
    DbSession.global_init(sqlite_path)