def fixtures():
    """Load source and scraper fixtures from CSV files into the database.

    Sources are inserted only when not already present; scrapers are always
    added, linked to their source object looked up by name.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)

    sources_path = os.path.join("fixtures", "sources")
    with open(sources_path) as fp:
        for row in csv.DictReader(fp):
            # Only insert a source that does not exist yet.
            if not data.load_source(**row):
                data.add_source(**row)

    scrapers_path = os.path.join("fixtures", "scrapers")
    with open(scrapers_path) as fp:
        for row in csv.DictReader(fp):
            # Swap the source *name* from the CSV for the loaded source object.
            row["source"] = data.load_source(row.pop("source"))
            data.add_scraper(**row)
def main():
    """Look up metadata for all pending MusicBrainz recordings.

    Fetches the MBIDs that still need processing, runs them through the
    latest scraper in chunks of 100, and logs progress/ETA after each chunk.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    print(scraper)

    mbids = get_mbids()
    total = len(mbids)
    starttime = time.time()
    done = 0
    for mblist in util.chunks(mbids, 100):
        lookup(mblist, scraper)
        # BUGFIX: count the actual chunk size — the final chunk may hold
        # fewer than 100 items, so a flat `done += 100` overshoots `total`
        # and skews the logged progress and ETA.
        done += len(mblist)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
def scrape_musicbrainz(recording_mbid):
    """Scrape one recording with the first available MusicBrainz scraper.

    :param recording_mbid: MBID of the recording to scrape.
    :return: None; a non-empty scrape result is stored via ``data.add_item``.
    """
    source = data.load_source("musicbrainz")
    scrapers = data.load_scrapers_for_source(source)
    if not scrapers:
        return

    scraper = scrapers[0]
    scraper_obj = metadb.scrapers.create_scraper_object(scraper)
    if not scraper_obj:
        return

    scraper_obj.config()
    result = scraper_obj.scrape({"mbid": recording_mbid})
    if result:
        data.add_item(scraper, recording_mbid, data=result)
def get_mbids():
    """Return recording MBIDs not yet processed by the latest scraper.

    :return: list of MBID strings present in ``recording`` but absent from
        ``item`` for the latest MusicBrainz scraper.
    """
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)

    existing_query = text("""
        SELECT mbid::text
          FROM item
         WHERE scraper_id = :scraper_id""")
    # Wrap in text(): SQLAlchemy 2.x no longer accepts raw SQL strings in
    # Connection.execute(); text() works on both 1.x and 2.x.
    all_query = text("SELECT mbid::text FROM recording")

    with db.engine.connect() as connection:
        res = connection.execute(existing_query, {"scraper_id": scraper["id"]})
        # Build the set directly (single pass) — the original built a list,
        # logged it, then converted to a set for membership tests.
        existing = {row[0] for row in res}
        log.info("got %s existing items", len(existing))

        res = connection.execute(all_query)
        remaining = [row[0] for row in res if row[0] not in existing]
        log.info("remaining %s", len(remaining))
        return remaining
def main():
    """Process saved scraper data for every recording, in large batches.

    Walks all recordings in chunks of 10000, fetches the rows the latest
    MusicBrainz scraper saved for each batch, processes them, and logs
    progress/ETA as it goes.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)

    recordings = get_recordings()
    total = len(recordings)
    done = 0
    starttime = time.time()
    log.info("starting..., %s recordings to process", total)

    for batch in util.chunks(recordings, 10000):
        log.info("have %s recordings", len(batch))
        # One short-lived connection per batch.
        with db.engine.connect() as connection:
            rows = get_data(connection, scraper["id"], batch)
            log.info(" - got %s rows matching them", len(rows))
            process(connection, rows)
        done += len(batch)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))