def main():
    """Run ``lookup`` over every MBID returned by ``get_mbids``.

    Initialises the database engine from ``config.SQLALCHEMY_DATABASE_URI``,
    loads the latest scraper registered for the "musicbrainz" source, and
    feeds the MBIDs to ``lookup`` in chunks of 100, logging progress after
    each chunk.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    print(scraper)

    mbids = get_mbids()
    total = len(mbids)
    starttime = time.time()
    done = 0
    for mblist in util.chunks(mbids, 100):
        lookup(mblist, scraper)
        # Count what was actually processed: the final chunk may hold fewer
        # than 100 items, and a fixed increment would push `done` past
        # `total` and distort the statistics computed from it.
        done += len(mblist)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
def get_mbids():
    """Return the recording MBIDs that have no item for the latest scraper.

    Loads the latest scraper for the "musicbrainz" source, reads the MBIDs
    already present in ``item`` for that scraper, and returns every MBID
    from ``recording`` that is not among them.

    Returns:
        A list of MBIDs (selected as ``mbid::text``) still to be processed,
        in the order the ``recording`` query returned them.
    """
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)

    existing_query = text("""
        SELECT mbid::text
          FROM item
         WHERE scraper_id = :scraper_id""")
    # Wrapped in text() like the query above: passing a bare string to
    # Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
    recording_query = text("SELECT mbid::text FROM recording")

    with db.engine.connect() as connection:
        res = connection.execute(existing_query, {"scraper_id": scraper["id"]})
        existing_rows = [r[0] for r in res]
        log.info("got %s existing items", len(existing_rows))
        # A set gives O(1) membership tests for the filter below.
        existing = set(existing_rows)

        res = connection.execute(recording_query)
        remaining = [r[0] for r in res if r[0] not in existing]
        log.info("remaining %s", len(remaining))

    return remaining
def main():
    """Process the saved scraper data for all recordings, in batches.

    Initialises the database engine, loads the latest scraper for the
    "musicbrainz" source, then walks the result of ``get_recordings`` in
    chunks of 10000. For each chunk a connection is opened, the matching
    rows are fetched with ``get_data`` and passed to ``process``; progress
    is logged after every chunk.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    scraper = data.load_latest_scraper_for_source(data.load_source("musicbrainz"))

    recordings = get_recordings()
    total = len(recordings)
    processed = 0
    started_at = time.time()
    log.info("starting..., %s recordings to process", total)

    for batch in util.chunks(recordings, 10000):
        batch_size = len(batch)
        log.info("have %s recordings", batch_size)

        # One connection per batch, released as soon as the batch is handled.
        with db.engine.connect() as connection:
            rows = get_data(connection, scraper["id"], batch)
            log.info(" - got %s rows matching them", len(rows))
            process(connection, rows)

        processed += batch_size
        elapsed, remaining = util.stats(processed, total, started_at)
        log.info("Done %s/%s in %s; %s remaining", processed, total, str(elapsed), str(remaining))