def _get_or_create(session, model, name, **extra):
    """Return the ``model`` row called ``name``, creating it when absent.

    :param session: session that a newly created instance is added to.
    :param model: mapped class exposing a ``query`` attribute and accepting
        ``name`` (plus ``extra``) as constructor keywords.
    :param name: value matched against the model's ``name`` attribute.
    :param extra: additional constructor keywords; used only on creation.
    :returns: the existing instance, or the newly added one.
    """
    instance = model.query.filter_by(name=name).first()
    if instance is None:
        # NOTE(review): a later lookup for the same name only finds this
        # pending instance if the session flushes before querying (autoflush
        # is SQLAlchemy's default) -- confirm against DBSession's setup.
        instance = model(name=name, **extra)
        session.add(instance)
    return instance


def populate():
    """Fill the database with the packages and releases listed on PyPI.

    Creates a ``Root`` named ``PyPI``, asks the PyPI XML-RPC endpoint for
    its package list, runs ``ingest_package`` over that list through
    ``pool.map`` and stores one ``Package`` per result together with its
    ``Release`` rows.  Classifiers, keywords, maintainer, author and license
    are shared between releases: each one is looked up by name and only
    created when no such row exists yet.

    Packages whose name is already present in the database are skipped.
    """
    session = DBSession()
    root = Root(name=u'PyPI')
    session.add(root)

    client = xmlrpclib.ServerProxy('http://pypi.python.org/pypi')
    packages = client.list_packages()

    # Do it in parallel to go faster
    results = pool.map(ingest_package, packages)

    for i, result in enumerate(results):
        package = result['name']
        # Single-argument parenthesised form: emits the same text whether
        # ``print`` is a statement or a function.
        print("Populating DB with: %s %s" % (i, package))

        # Query for it first...
        if Package.query.filter_by(name=package).count() > 0:
            print("Package '%s' is already in the DB. Skipping." % package)
            continue

        p = Package(name=package, root=root)
        session.add(p)

        for release_data in result['releases']:
            data = release_data['data']
            r = Release(
                name=release_data['name'],
                package=p,
                summary=data.get('summary', ''),
            )

            # ``.get(...) or <empty>`` keeps a release with sparse metadata
            # from aborting the whole import with a KeyError, matching how
            # 'summary' and the optional people/license keys are treated.
            for classifier in data.get('classifiers') or []:
                r.classifiers.append(
                    _get_or_create(session, Classifier, classifier))

            # Keywords are split on whitespace only.
            for keyword in (data.get('keywords') or '').split():
                r.keywords.append(_get_or_create(session, Keyword, keyword))

            # NOTE(review): a key that is present but empty/None still
            # produces a row with that empty name -- confirm this is wanted.
            if 'maintainer' in data:
                r.maintainer = _get_or_create(
                    session, Maintainer, data['maintainer'],
                    email=data.get('maintainer_email'))

            if 'author' in data:
                r.author = _get_or_create(
                    session, Author, data['author'],
                    email=data.get('author_email'))

            if 'license' in data:
                r.license = _get_or_create(session, License, data['license'])

            session.add(r)

    # NOTE(review): everything is committed in one transaction at the end;
    # the original layout was ambiguous about where the commit sits -- confirm.
    session.commit()