Example #1
0
def fixtures():
    """Load the source and scraper fixtures from CSV files into the database.

    Initialises the database engine, then reads two CSV files whose paths
    are relative to the current working directory:

    * ``fixtures/sources`` -- each row is passed as keyword arguments to
      ``data.load_source``; when that returns a falsy value the same row is
      passed to ``data.add_source``.
    * ``fixtures/scrapers`` -- the ``source`` column of each row is replaced
      by the value ``data.load_source`` returns for it, and the row is then
      passed as keyword arguments to ``data.add_scraper``.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)

    sources = os.path.join("fixtures", "sources")
    # newline="" leaves newline handling to the csv module, as its
    # documentation requires for files handed to a reader; otherwise
    # newlines embedded in quoted fields are not interpreted correctly.
    with open(sources, newline="") as fp:
        for row in csv.DictReader(fp):
            # Skip rows for which load_source already returns something.
            if not data.load_source(**row):
                data.add_source(**row)

    scrapers = os.path.join("fixtures", "scrapers")
    with open(scrapers, newline="") as fp:
        for row in csv.DictReader(fp):
            # The CSV holds the source by name; add_scraper receives whatever
            # load_source returns for that name instead.
            sname = row.pop("source")
            row["source"] = data.load_source(sname)
            data.add_scraper(**row)
Example #2
0
File: manage.py Project: MTG/metadb
def fixtures():
    """Populate the database from the ``fixtures/sources`` and
    ``fixtures/scrapers`` CSV files.

    The database engine is initialised first. Every row of the sources file
    is looked up with ``data.load_source(**row)`` and, when the lookup
    result is falsy, added with ``data.add_source(**row)``. Every row of the
    scrapers file has its ``source`` value swapped for the result of
    ``data.load_source`` on that value before being handed to
    ``data.add_scraper(**row)``. Both paths are relative to the current
    working directory.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)

    sources = os.path.join("fixtures", "sources")
    # The csv module documentation asks for files to be opened with
    # newline="" so the reader, not the file object, handles line endings
    # (needed for quoted fields that contain newlines).
    with open(sources, newline="") as fp:
        reader = csv.DictReader(fp)
        for line in reader:
            # Only add rows that load_source does not already find.
            if not data.load_source(**line):
                data.add_source(**line)

    scrapers = os.path.join("fixtures", "scrapers")
    with open(scrapers, newline="") as fp:
        reader = csv.DictReader(fp)
        for line in reader:
            # Replace the source name from the CSV with the loaded source.
            sname = line.pop("source")
            line["source"] = data.load_source(sname)
            data.add_scraper(**line)
Example #3
0
def main():
    """Run ``lookup`` over all MBIDs from ``get_mbids`` in chunks of 100.

    Initialises the database engine, loads the scraper returned by
    ``data.load_latest_scraper_for_source`` for the "musicbrainz" source,
    prints it, and then calls ``lookup(chunk, scraper)`` for each chunk,
    logging progress figures from ``util.stats`` after every chunk.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    print(scraper)

    mbids = get_mbids()
    total = len(mbids)

    starttime = time.time()
    done = 0
    for mblist in util.chunks(mbids, 100):
        lookup(mblist, scraper)
        # Count the items actually handled: the final chunk can be shorter
        # than 100, and a fixed increment would push `done` past `total`.
        # Assumes util.chunks yields sized sequences -- confirm against util.
        done += len(mblist)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
def main():
    """Process every MBID returned by ``get_mbids`` in batches of 100.

    After initialising the database engine, the "musicbrainz" source is
    loaded and passed to ``data.load_latest_scraper_for_source``; the
    resulting scraper is printed and handed to ``lookup`` together with
    each batch. Progress is logged after each batch using ``util.stats``.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)
    print(scraper)

    mbids = get_mbids()
    total = len(mbids)

    starttime = time.time()
    done = 0
    for mblist in util.chunks(mbids, 100):
        lookup(mblist, scraper)
        # Advance by the real batch size; adding a constant 100 over-counts
        # whenever the last batch is short, so `done` could exceed `total`.
        # Assumes util.chunks yields sized sequences -- confirm against util.
        done += len(mblist)
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta),
                 str(remdelta))
Example #5
0
def scrape_musicbrainz(recording_mbid):
    """Scrape one recording with the first "musicbrainz" scraper and store it.

    Loads the "musicbrainz" source and its scrapers, builds a scraper
    object from the first scraper in the list, configures it and calls its
    ``scrape`` method with ``{"mbid": recording_mbid}``. A truthy result is
    saved through ``data.add_item``. Nothing happens when there are no
    scrapers, when no scraper object could be created, or when the scrape
    result is falsy.

    :param recording_mbid: MBID passed to the scraper under the "mbid" key
        and to ``data.add_item``
    :return: None
    """
    source = data.load_source("musicbrainz")
    scrapers = data.load_scrapers_for_source(source)
    if not scrapers:
        return

    scraper = scrapers[0]
    scraper_obj = metadb.scrapers.create_scraper_object(scraper)
    if not scraper_obj:
        return

    scraper_obj.config()
    result = scraper_obj.scrape({"mbid": recording_mbid})
    if not result:
        return

    data.add_item(scraper, recording_mbid, data=result)
def get_mbids():
    """Return the recording MBIDs that have no item for the current scraper.

    The scraper is the one returned by
    ``data.load_latest_scraper_for_source`` for the "musicbrainz" source.
    MBIDs already present in ``item`` for that scraper id are collected
    into a set, and every MBID in ``recording`` that is not in the set is
    returned.

    :return: list of MBIDs (as text) from ``recording`` that are not yet in
        ``item`` for the scraper
    """
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)

    existing_query = text("""
        SELECT mbid::text
          FROM item
         WHERE scraper_id = :scraper_id""")
    # Wrapped in text() like the query above: passing a plain string to
    # Connection.execute is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
    recording_query = text("SELECT mbid::text FROM recording")

    with db.engine.connect() as connection:
        res = connection.execute(existing_query, {"scraper_id": scraper["id"]})
        existing_rows = [r[0] for r in res]
        log.info("got %s existing items", len(existing_rows))
        # A set makes the membership test below constant-time per row.
        existing = set(existing_rows)
        res = connection.execute(recording_query)
        remaining = [r[0] for r in res if r[0] not in existing]
        log.info("remaining %s", len(remaining))
        return remaining
Example #7
0
def get_mbids():
    """List recording MBIDs not yet stored as items for the current scraper.

    Loads the "musicbrainz" source and the scraper that
    ``data.load_latest_scraper_for_source`` returns for it, reads the MBIDs
    already in ``item`` for that scraper id, and returns the MBIDs from
    ``recording`` that are not among them.

    :return: list of MBIDs (as text) present in ``recording`` but absent
        from ``item`` for the scraper
    """
    source = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(source)

    existing_query = text("""
        SELECT mbid::text
          FROM item
         WHERE scraper_id = :scraper_id""")

    # Plain SQL strings are no longer accepted by Connection.execute in
    # SQLAlchemy 2.0 (deprecated in 1.4), so this query uses text() as well.
    all_recordings_query = text("SELECT mbid::text FROM recording")
    with db.engine.connect() as connection:
        res = connection.execute(existing_query, {"scraper_id": scraper["id"]})
        scraped = [r[0] for r in res]
        log.info("got %s existing items", len(scraped))
        # Convert once to a set so the filter below does not rescan a list.
        scraped = set(scraped)
        res = connection.execute(all_recordings_query)
        remaining = [r[0] for r in res if r[0] not in scraped]
        log.info("remaining %s", len(remaining))
        return remaining
Example #8
0
def main():
    """Process all recordings from ``get_recordings`` in chunks of 10000.

    Initialises the database engine and loads the scraper returned by
    ``data.load_latest_scraper_for_source`` for the "musicbrainz" source.
    For each chunk a new connection is opened, ``get_data`` is called with
    the connection, the scraper id and the chunk, and its result is passed
    to ``process``. Progress figures from ``util.stats`` are logged after
    every chunk.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    scraper = data.load_latest_scraper_for_source(data.load_source("musicbrainz"))

    recordings = get_recordings()
    total = len(recordings)
    starttime = time.time()
    log.info("starting..., %s recordings to process", total)

    done = 0
    for batch in util.chunks(recordings, 10000):
        log.info("have %s recordings", len(batch))
        with db.engine.connect() as connection:
            rows = get_data(connection, scraper["id"], batch)
            log.info(" - got %s rows matching them", len(rows))
            process(connection, rows)
        done += len(batch)
        elapsed, left = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(elapsed), str(left))
def main():
    """Run ``get_data`` and ``process`` over every recording, 10000 at a time.

    The database engine is initialised, the "musicbrainz" source is loaded
    and handed to ``data.load_latest_scraper_for_source``. The recordings
    from ``get_recordings`` are then split with ``util.chunks``; each chunk
    gets its own connection on which the saved data is fetched for the
    scraper id and passed to ``process``. After each chunk the running
    count and the timing values from ``util.stats`` are logged.
    """
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    musicbrainz = data.load_source("musicbrainz")
    scraper = data.load_latest_scraper_for_source(musicbrainz)

    recordings = get_recordings()
    total = len(recordings)
    processed = 0
    started_at = time.time()
    log.info("starting..., %s recordings to process", total)

    for chunk in util.chunks(recordings, 10000):
        log.info("have %s recordings", len(chunk))
        with db.engine.connect() as conn:
            saved = get_data(conn, scraper["id"], chunk)
            log.info(" - got %s rows matching them", len(saved))
            process(conn, saved)
        processed += len(chunk)
        duration, remaining = util.stats(processed, total, started_at)
        log.info(
            "Done %s/%s in %s; %s remaining",
            processed,
            total,
            str(duration),
            str(remaining),
        )