def main():
    """Scrape metadata for every MusicBrainz recording MBID not yet covered
    by the latest scraper, logging progress/ETA after each chunk.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    print(scraper)
    mbids = get_mbids()
    total = len(mbids)
    starttime = time.time()
    done = 0
    for mblist in util.chunks(mbids, 100):
        lookup(mblist, scraper)
        # Bug fix: the final chunk may contain fewer than 100 MBIDs, so count
        # what was actually processed instead of assuming a full chunk.
        done += len(mblist)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
def process(connection, datalist):
    """Insert scraped recording metadata and its release groups into the DB.

    Args:
        connection: open DB connection handed through to the insert helpers.
        datalist: iterable of (recording_mbid, item) pairs; item["result"]
            holds the scraped payload and may be falsy (skipped if so).
    """
    added = 0
    for recmbid, item in datalist:
        # Renamed from `data` to avoid shadowing the module-level `data`
        # module used elsewhere in this file.
        result = item["result"]
        if not result:
            # Nothing was scraped for this recording; skip it.
            continue
        recording_title = result["name"]
        rec_artist_credit = result["artist_credit"]
        add_to_recmeta(connection, recmbid, recording_title, rec_artist_credit)
        added += 1
        # One row per release group this recording appears on.
        for rgmbid, rgdata in result["release_group_map"].items():
            rgtitle = rgdata["name"]
            earliest_date = rgdata["first_release_date"]
            rgartist_credit = rgdata["artist_credit"]
            add_to_rg(connection, rgmbid, recmbid, rgtitle, rgartist_credit, earliest_date)
    log.info(" * added %s recordings", added)
def process_file(module, filename, numworkers, save=False):
    """Read query rows from a CSV file and process them chunk by chunk.

    Args:
        module: processing module forwarded to process_items.
        filename: path of the CSV file to read (header row expected).
        numworkers: worker count forwarded to process_items.
        save: when True, results are persisted by process_items.
    """
    # newline="" is required when handing a file object to the csv module
    # (per the csv documentation) to avoid newline-translation artifacts.
    with open(filename, newline="") as csvfile:
        data = list(csv.DictReader(csvfile))
    total = len(data)
    starttime = time.time()
    done = 0
    CHUNK_SIZE = 1
    for items in util.chunks(data, CHUNK_SIZE):
        process_items(items, module, save, numworkers)
        # Count actual items so a short final chunk is not overcounted.
        done += len(items)
        durdelta, remdelta = util.stats(done, total, starttime)
        # Jitter between chunks, presumably to be polite to a remote service.
        time.sleep(random.uniform(.5, 1.5))
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
def get_mbids():
    """Return recording MBIDs that have no item row for the latest
    musicbrainz scraper (i.e. MBIDs still waiting to be scraped).
    """
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    # Query for MBIDs already scraped by this scraper version.
    scraped_query = text(""" SELECT mbid::text FROM item WHERE scraper_id = :scraper_id""")
    all_recordings_query = "SELECT mbid::text FROM recording"
    with db.engine.connect() as connection:
        result = connection.execute(scraped_query, {"scraper_id": scraper["id"]})
        scraped_mbids = [row[0] for row in result]
        log.info("got %s existing items", len(scraped_mbids))
        # Set for O(1) membership tests in the filter below.
        scraped_set = set(scraped_mbids)
        result = connection.execute(all_recordings_query)
        remaining = [row[0] for row in result if row[0] not in scraped_set]
        log.info("remaining %s", len(remaining))
        return remaining
def main():
    """Export release-group and recording metadata from the DB to CSV files.

    Writes one release-group file, then splits the recordings across eight
    (roughly equal) numbered CSV files.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    log.info("Release groups")
    releasegroups = get_rgs()
    fieldnames = ["mbid", "release_title", "artist", "year"]
    # newline="" is required for csv writers (per the csv documentation);
    # without it, platforms that translate newlines emit blank rows.
    with open("release-group-meta.csv", "w", newline="") as fp:
        w = csv.DictWriter(fp, fieldnames=fieldnames)
        w.writeheader()
        for rg in releasegroups:
            w.writerow(rg)
    log.info("Recordings")
    recordings = get_recordings()
    fieldnames = ["mbid", "recording", "artist"]
    # Chunk size chosen so the recordings split into 8 output files.
    count = (len(recordings) // 8) + 1
    for i, reclist in enumerate(util.chunks(recordings, count), 1):
        with open("recording-meta-{}.csv".format(i), "w", newline="") as fp:
            w = csv.DictWriter(fp, fieldnames=fieldnames)
            w.writeheader()
            for rec in reclist:
                w.writerow(rec)
def main():
    """Export release-group and recording metadata from the DB to CSV files.

    Writes one release-group file, then splits the recordings across eight
    (roughly equal) numbered CSV files.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    log.info("Release groups")
    releasegroups = get_rgs()
    fieldnames = ["mbid", "release_title", "artist", "year"]
    # newline="" is required for csv writers (per the csv documentation);
    # without it, platforms that translate newlines emit blank rows.
    with open("release-group-meta.csv", "w", newline="") as fp:
        w = csv.DictWriter(fp, fieldnames=fieldnames)
        w.writeheader()
        for rg in releasegroups:
            w.writerow(rg)
    log.info("Recordings")
    recordings = get_recordings()
    fieldnames = ["mbid", "recording", "artist"]
    # Chunk size chosen so the recordings split into 8 output files.
    count = (len(recordings) // 8) + 1
    for i, reclist in enumerate(util.chunks(recordings, count), 1):
        with open("recording-meta-{}.csv".format(i), "w", newline="") as fp:
            w = csv.DictWriter(fp, fieldnames=fieldnames)
            w.writeheader()
            for rec in reclist:
                w.writerow(rec)
def main():
    """Load previously-scraped results for all recordings into the metadata
    tables, in batches of 10000, with progress/ETA logging per batch.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    recordings = get_recordings()
    total = len(recordings)
    done = 0
    starttime = time.time()
    log.info("starting..., %s recordings to process", total)
    for batch in util.chunks(recordings, 10000):
        log.info("have %s recordings", len(batch))
        # A fresh connection per batch keeps each batch's work self-contained.
        with db.engine.connect() as connection:
            saved = get_data(connection, scraper["id"], batch)
            log.info(" - got %s rows matching them", len(saved))
            process(connection, saved)
        done += len(batch)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))