Ejemplo n.º 1
0
def main(verbose=False):
    download_if_modified(bbc_sitemap_url, bbc_sitemap)

    db = db_connect()

    release_redirects = dict(get_release_redirects(db))
    release_groups = dict(get_release_groups(db))
    releases = dict(get_releases(db))
    bbc_reviews_set = set((gid, url) for gid, url in db.execute("""SELECT gid, url FROM bot_bbc_reviews_set"""))

    review_urls = defaultdict(set)
    for rg, url in get_review_urls(db):
        review_urls[rg].add(url)

    cleanup_review_urls = set()
    for cleanup_url in cleanup_urls:
        f = urllib.urlopen(cleanup_url)
        cleanup_review_urls |= set(re.findall(ur"http://www.bbc.co.uk/music/reviews/[0-9a-z]+", f.read()))

    editor_id = db.execute("""SELECT id FROM editor WHERE name = %s""", cfg.MB_USERNAME).first()[0]
    mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE, editor_id=editor_id)

    normal_edits_left, edits_left = mb.edits_left()

    bbc_reviews = list(load_bbc_reviews(bbc_sitemap))
    count = len(bbc_reviews)
    for i, (review_url, release_url, title) in enumerate(bbc_reviews):
        if normal_edits_left <= 0:
            break
        if verbose:
            out(u"%d/%d - %.2f%%" % (i + 1, count, (i + 1) * 100.0 / count))
            out(u"%s %s" % (title, review_url))
            out(release_url)
        if review_url in cleanup_review_urls:
            continue
        release_gid = utils.extract_mbid(release_url, "release")
        row = release_redirects.get(release_gid)
        if not row:
            row = releases.get(release_gid)
        if not row:
            if verbose:
                out("  non-existant release in review %s" % review_url)
            continue
        rg, ac, release_name = row
        gid, name = release_groups[rg]
        if review_url in review_urls[rg]:
            continue
        if (gid, review_url) in bbc_reviews_set:
            if verbose:
                out(u"  already linked earlier (probably got removed by some editor!")
            continue
        mb_title = "%s - %s" % (artist_credit(db, ac), release_name)
        if not are_similar(title, mb_title):
            if verbose:
                out(u"  similarity too small: %s <-> %s" % (title, mb_title))
                # out(u'|-\n| [%s %s]\n| [[ReleaseGroup:%s|%s]]\n| [[Release:%s|%s]]' % (review_url, bbc_name, gid, name, release_gid, release_name))
            continue
        text = (
            u"Review is in BBC mapping [1], and review name “%s” is"
            " similar to the release name. If this is wrong,"
            " please note it here and put the correct mapping in"
            " the wiki [2].\n\n[1] %s\n[2] %s" % (title, bbc_sitemap_url, cleanup_urls[0])
        )
        text += "\n\n%s" % prog
        try:
            out(u"http://musicbrainz.org/release-group/%s  ->  %s" % (gid, review_url))
            mb.add_url("release_group", gid, 94, review_url, text, auto=False)
            db.execute("INSERT INTO bot_bbc_reviews_set (gid,url) VALUES (%s,%s)", (gid, review_url))
            bbc_reviews_set.add((gid, review_url))
            normal_edits_left -= 1
        except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as e:
            out(e)