found_albums = [] albums = set([r[0] for r in db.execute(query_artist_albums, (id, id))]) albums_to_ignore = set() for album in albums: if mangle_name(name) in mangle_name(album): albums_to_ignore.add(album) albums -= albums_to_ignore if not albums: continue for album in albums: mangled_album = mangle_name(album) if len(mangled_album) > 4 and mangled_album in page: found_albums.append(album) ratio = len(found_albums) * 1.0 / len(albums) print ' * ratio: %s, has albums: %s, found albums: %s' % (ratio, len(albums), len(found_albums)) min_ratio = 0.2 if len(found_albums) < 2: continue #if ratio < min_ratio: # continue url = 'http://ko.wikipedia.org/wiki/%s' % (quote_page_title(page_title),) text = 'Matched based on the name. The page mentions %s.' % (join_names('album', found_albums),) print ' * linking to %s' % (url,) print ' * edit note: %s' % (text,) mb.add_url("artist", gid, 179, url, text) break db.execute("INSERT INTO bot_wp_artist_ko (gid) VALUES (%s)", (gid,)) print processed, skipped
if "disambiguationpages" in page: print " * disambiguation or album page, skipping" continue if "recordlabels" not in page: print " * not a record label page, skipping" continue page_title = pages[0]["title"] print ' * trying article "%s"' % (page_title,) artists = set([r[0] for r in db.execute(query_label_artists, (id,))]) if name in artists: artists.remove(name) if not artists: continue found_artists = [] for artist in artists: mangled_artist = mangle_name(artist) if len(mangled_artist) > 5 and mangled_artist in page: found_artists.append(artist) ratio = len(found_artists) * 1.0 / len(artists) print " * ratio: %s, has artists: %s, found artists: %s" % (ratio, len(artists), len(found_artists)) if len(found_artists) < 2: continue url = "https://en.wikipedia.org/wiki/%s" % (quote_page_title(page_title),) text = "Matched based on the name. The page mentions %s." % (join_names("artist", found_artists),) print " * linking to %s" % (url,) print " * edit note: %s" % (text,) time.sleep(60) mb.add_url("label", gid, 216, url, text) break db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))
def main(verbose=False): download_if_modified(bbc_sitemap_url, bbc_sitemap) db = db_connect() release_redirects = dict(get_release_redirects(db)) release_groups = dict(get_release_groups(db)) releases = dict(get_releases(db)) bbc_reviews_set = set((gid, url) for gid, url in db.execute("""SELECT gid, url FROM bot_bbc_reviews_set""")) review_urls = defaultdict(set) for rg, url in get_review_urls(db): review_urls[rg].add(url) cleanup_review_urls = set() for cleanup_url in cleanup_urls: f = urllib.urlopen(cleanup_url) cleanup_review_urls |= set(re.findall(ur"http://www.bbc.co.uk/music/reviews/[0-9a-z]+", f.read())) editor_id = db.execute("""SELECT id FROM editor WHERE name = %s""", cfg.MB_USERNAME).first()[0] mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE, editor_id=editor_id) normal_edits_left, edits_left = mb.edits_left() bbc_reviews = list(load_bbc_reviews(bbc_sitemap)) count = len(bbc_reviews) for i, (review_url, release_url, title) in enumerate(bbc_reviews): if normal_edits_left <= 0: break if verbose: out(u"%d/%d - %.2f%%" % (i + 1, count, (i + 1) * 100.0 / count)) out(u"%s %s" % (title, review_url)) out(release_url) if review_url in cleanup_review_urls: continue release_gid = utils.extract_mbid(release_url, "release") row = release_redirects.get(release_gid) if not row: row = releases.get(release_gid) if not row: if verbose: out(" non-existant release in review %s" % review_url) continue rg, ac, release_name = row gid, name = release_groups[rg] if review_url in review_urls[rg]: continue if (gid, review_url) in bbc_reviews_set: if verbose: out(u" already linked earlier (probably got removed by some editor!") continue mb_title = "%s - %s" % (artist_credit(db, ac), release_name) if not are_similar(title, mb_title): if verbose: out(u" similarity too small: %s <-> %s" % (title, mb_title)) # out(u'|-\n| [%s %s]\n| [[ReleaseGroup:%s|%s]]\n| [[Release:%s|%s]]' % (review_url, bbc_name, gid, name, release_gid, release_name)) continue text = ( 
u"Review is in BBC mapping [1], and review name “%s” is" " similar to the release name. If this is wrong," " please note it here and put the correct mapping in" " the wiki [2].\n\n[1] %s\n[2] %s" % (title, bbc_sitemap_url, cleanup_urls[0]) ) text += "\n\n%s" % prog try: out(u"http://musicbrainz.org/release-group/%s -> %s" % (gid, review_url)) mb.add_url("release_group", gid, 94, review_url, text, auto=False) db.execute("INSERT INTO bot_bbc_reviews_set (gid,url) VALUES (%s,%s)", (gid, review_url)) bbc_reviews_set.add((gid, review_url)) normal_edits_left -= 1 except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as e: out(e)
if (found_artists): reasons.append(join_names('related artist', found_artists)) out(' * has related artists: %s, found related artists: %s' % (len(artists), len(found_artists))) # Determine if artist matches if not found_albums and not found_works and not found_artists and not found_urls: continue # Check if wikipedia lang is compatible with artist country if wp_lang != 'en' or wp_lang in acceptable_countries_for_lang: if wp_lang not in acceptable_countries_for_lang: continue country, country_reasons = determine_country(wikipage) if (country not in acceptable_countries_for_lang[wp_lang]): colored_out(bcolors.HEADER, ' * artist country (%s) not compatible with wiki language (%s)' % (country, wp_lang)) continue wp_url = 'https://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),) wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper() text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions %s.' % (wp_url, ', '.join(reasons)) colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url,)) out(' * edit note: %s' % (text,)) time.sleep(60) mb.add_url("artist", artist['gid'], 352, wd_url, text) break if artist['processed'] is None: db.execute("INSERT INTO bot_wp_artist_link (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang)) else: db.execute("UPDATE bot_wp_artist_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], wp_lang))
country, country_reasons = determine_country(wikipage) if (country not in acceptable_countries_for_lang[wp_lang]): colored_out( bcolors.HEADER, ' * artist country (%s) not compatible with wiki language (%s)' % (country, wp_lang)) continue wp_url = 'http://%s.wikipedia.org/wiki/%s' % ( wp_lang, quote_page_title(page_title), ) wd_url = 'http://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper( ) text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions %s.' % ( wp_url, ', '.join(reasons)) colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url, )) out(' * edit note: %s' % (text, )) time.sleep(60) mb.add_url("artist", artist['gid'], 352, wd_url, text) break if artist['processed'] is None: db.execute( "INSERT INTO bot_wp_artist_link (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang)) else: db.execute( "UPDATE bot_wp_artist_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], wp_lang))
continue found_tracks = [] tracks = set([r[0] for r in db.execute(query_album_tracks, (rg_id,))]) tracks_to_ignore = set() for track in tracks: mangled_track = mangle_name(track) if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track): tracks_to_ignore.add(track) tracks -= tracks_to_ignore if len(tracks) < 5: continue for track in tracks: mangled_track = mangle_name(track) if len(mangled_track) > 4 and mangled_track in page: found_tracks.append(track) else: out(" * track %s not found" % (track,)) ratio = len(found_tracks) * 1.0 / len(tracks) out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks))) min_ratio = 0.8 if len(rg_name) > 4 else 1.0 if ratio < min_ratio: continue text = 'Matched based on the name. The page mentions artist "%s" and %s.' % (ac_name, join_names('track', found_tracks),) out(' * linking to %s' % (url,)) out(' * edit note: %s' % (text,)) time.sleep(30) mb.add_url("release_group", rg_gid, 89, url, text) break db.execute("INSERT INTO bot_wp_rg (gid) VALUES (%s)", (rg_gid,))
if 'recordlabels' not in page: print ' * not a record label page, skipping' continue page_title = pages[0]['title'] print ' * trying article "%s"' % (page_title, ) artists = set([r[0] for r in db.execute(query_label_artists, (id, ))]) if name in artists: artists.remove(name) if not artists: continue found_artists = [] for artist in artists: mangled_artist = mangle_name(artist) if len(mangled_artist) > 5 and mangled_artist in page: found_artists.append(artist) ratio = len(found_artists) * 1.0 / len(artists) print ' * ratio: %s, has artists: %s, found artists: %s' % ( ratio, len(artists), len(found_artists)) if len(found_artists) < 2: continue url = 'https://en.wikipedia.org/wiki/%s' % ( quote_page_title(page_title), ) text = 'Matched based on the name. The page mentions %s.' % ( join_names('artist', found_artists), ) print ' * linking to %s' % (url, ) print ' * edit note: %s' % (text, ) time.sleep(60) mb.add_url("label", gid, 216, url, text) break db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid, ))
if wp_lang not in acceptable_countries_for_lang: continue country, country_reasons = determine_country(wikipage) if (country not in acceptable_countries_for_lang[wp_lang]): colored_out( bcolors.HEADER, ' * artist country (%s) not compatible with wiki language (%s)' % (country, wp_lang)) continue url = 'http://%s.wikipedia.org/wiki/%s' % ( wp_lang, quote_page_title(page_title), ) text = 'Matched based on the name. The page mentions %s.' % ( ', '.join(reasons), ) colored_out(bcolors.OKGREEN, ' * linking to %s' % (url, )) out(' * edit note: %s' % (text, )) time.sleep(60) mb.add_url("artist", artist['gid'], 179, url, text) break if artist['processed'] is None: db.execute( "INSERT INTO bot_wp_artist_link (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang)) else: db.execute( "UPDATE bot_wp_artist_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], wp_lang))
# For each release group, link the Discogs master URL when every release in
# the group carries a distinct Discogs link and all of them resolve to the
# same master whose name is close to the release group name.
for i, (rg, gid, name) in enumerate(itertools.chain(*rg_grouped)):
    urls = set(u[0] for u in db.execute(query_rg_release_discogs, rg))
    # A single Discogs link gives nothing to cross-check against.
    if len(urls) < 2:
        continue
    # Report one-based progress so the first item is not shown as 0/N.
    out(u'%d/%d - %.2f%%' % (i + 1, count, (i + 1) * 100.0 / count))
    out(u'%s http://musicbrainz.org/release-group/%s' % (name, gid))
    masters = list(discogs_get_master(urls))
    if not masters:
        out(u' aborting, no Discogs master!')
        continue
    if len(set(masters)) > 1:
        out(u' aborting, releases with different Discogs master in one group!')
        continue
    if len(masters) != len(urls):
        out(u' aborting, releases without Discogs master in group!')
        continue
    master_name, master_id, master_artists = masters[0]
    # Case-insensitive similarity between the master name and the group name.
    ratio = Levenshtein.ratio(master_name.lower(), name.lower())
    if ratio < 0.8:
        out(u' Similarity ratio too small: %.2f' % ratio)
        continue
    master_url = 'http://www.discogs.com/master/%d' % master_id
    text = u'There are %d distinct Discogs links in this release group, and all point to this master URL.\n' % len(urls)
    text += u'The name of the Discogs master is “%s” (similarity: %.0f%%)' % (master_name, ratio * 100)
    text += u' by %s.' % master_artists
    out(u' %s\n %s' % (master_url, text))
    mb.add_url('release_group', gid, 90, master_url, text)
mangled_track = mangle_name(track) if len(mangled_track) > 4 and mangled_track in page: found_tracks.append(track) ratio = len(found_tracks) * 1.0 / len(tracks) out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks))) min_ratio = 0.7 if len(rg_name) > 4 else 1.0 if ratio < min_ratio: colored_out(bcolors.WARNING, ' => ratio too low (min = %s)' % min_ratio) continue auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types)) text = 'Matched based on the name. The page mentions artist "%s" and %s.' % ( ac_name, join_names('track', found_tracks), ) colored_out(bcolors.OKGREEN, ' * linking to %s' % (url, )) out(' * edit note: %s' % (text, )) time.sleep(5) mb.add_url("release_group", rg_gid, 89, url, text, auto=auto) break if processed is None: db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang)) else: db.execute( "UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
except ValueError: pass except urllib2.HTTPError: pass for shs_artist in shs_artists: shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName'])) mb_artist_name = mangle_name(artist['name']) if shs_artist_name == mb_artist_name: artist_uri = shs_artist['uri'] break elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85: print " * '%s' has a similarity of %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name))) artist_uri = shs_artist['uri'] break if artist_uri: matched_artists.add(artist['gid']) colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri)) edit_note = 'Guessing artist SecondHandSongs URL from work https://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url']) out(' * edit note: %s' % (edit_note,)) mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note) else: colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],)) if artist['processed'] is None and artist['gid'] not in seen_artists: db.execute("INSERT INTO bot_shs_link_artist (artist) VALUES (%s)", (artist['gid'],)) else: db.execute("UPDATE bot_shs_link_artist SET processed = now() WHERE artist = %s", (artist['gid'],)) seen_artists.add(artist['gid'])
artist_key = 'cdbaby:' + album['artist_cdbaby_id'] if 'type' not in album: album['type'] = 'album' album_url = 'http://www.cdbaby.com/cd/' + album['_id'].split(':')[1] print "adding", album_url if 'artist_mbid' not in album: artist = db.artists.find_one(artist_key) if not artist: artist_url = 'http://www.cdbaby.com/Artist/' + album['artist_cdbaby_id'] mbid = mb.add_artist({'name': album['artist']}, artist_url) artist = {'_id': artist_key, 'mbid': mbid} db.artists.save(artist) print 'added artist', mbid album['artist_mbid'] = artist['mbid'] #pprint.pprint(album) edit_note = album_url mbid = mb.add_release(album, edit_note) mb.add_url('release', mbid, 78, album_url) album['mbid'] = mbid album['status']['imported'] = True db.albums.save(album) print 'added release', mbid #form = album_to_form(album) #print '<form action="http://musicbrainz.org/release/add" method="post">' #for name, value in form.iteritems(): # print '<input type="hidden" name="%s" value="%s" />' % (html_escape(name), html_escape(unicode(value))) #print '<input type="submit" value="Add Release">' #print '</form>'
artist_key = 'cdbaby:' + album['artist_cdbaby_id'] if 'type' not in album: album['type'] = 'album' album_url = 'http://www.cdbaby.com/cd/' + album['_id'].split(':')[1] print "adding", album_url if 'artist_mbid' not in album: artist = db.artists.find_one(artist_key) if not artist: artist_url = 'http://www.cdbaby.com/Artist/' + album['artist_cdbaby_id'] mbid = mb.add_artist({'name': album['artist']}, artist_url) artist = {'_id': artist_key, 'mbid': mbid} db.artists.save(artist) print 'added artist', mbid album['artist_mbid'] = artist['mbid'] #pprint.pprint(album) edit_note = album_url mbid = mb.add_release(album, edit_note) mb.add_url('release', mbid, 78, album_url) album['mbid'] = mbid album['status']['imported'] = True db.albums.save(album) print 'added release', mbid #form = album_to_form(album) #print '<form action="https://musicbrainz.org/release/add" method="post">' #for name, value in form.iteritems(): # print '<input type="hidden" name="%s" value="%s" />' % (html_escape(name), html_escape(unicode(value))) #print '<input type="submit" value="Add Release">' #print '</form>'
mangled_track = mangle_name(track) if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track): tracks_to_ignore.add(track) tracks -= tracks_to_ignore if len(tracks) < 5: continue for track in tracks: mangled_track = mangle_name(track) if len(mangled_track) > 4 and mangled_track in page: found_tracks.append(track) ratio = len(found_tracks) * 1.0 / len(tracks) out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks))) min_ratio = 0.7 if len(rg_name) > 4 else 1.0 if ratio < min_ratio: colored_out(bcolors.WARNING, ' => ratio too low (min = %s)' % min_ratio) continue auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types)) wp_url = 'https://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),) wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper() text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions artist "%s" and %s.' % (wp_url, ac_name, join_names('track', found_tracks),) colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url,)) out(' * edit note: %s' % (text,)) time.sleep(5) mb.add_url("release_group", rg_gid, 353, wd_url, text, auto=auto) break if processed is None: db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang)) else: db.execute("UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
found_artists.append(rel_artist) if (found_artists): reasons.append(join_names('related artist', found_artists)) out(' * has related artists: %s, found related artists: %s' % (len(artists), len(found_artists))) # Determine if artist matches if not found_albums and not found_works and not found_artists and not found_urls: continue # Check if wikipedia lang is compatible with artist country if wp_lang != 'en' or wp_lang in acceptable_countries_for_lang: if wp_lang not in acceptable_countries_for_lang: continue country, country_reasons = determine_country(wikipage) if (country not in acceptable_countries_for_lang[wp_lang]): colored_out(bcolors.HEADER, ' * artist country (%s) not compatible with wiki language (%s)' % (country, wp_lang)) continue url = 'http://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),) text = 'Matched based on the name. The page mentions %s.' % (', '.join(reasons),) colored_out(bcolors.OKGREEN, ' * linking to %s' % (url,)) out(' * edit note: %s' % (text,)) time.sleep(60) mb.add_url("artist", artist['gid'], 179, url, text) break if artist['processed'] is None: db.execute("INSERT INTO bot_wp_artist_link (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang)) else: db.execute("UPDATE bot_wp_artist_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], wp_lang))
except ValueError: pass except urllib2.HTTPError: pass for shs_artist in shs_artists: shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName'])) mb_artist_name = mangle_name(artist['name']) if shs_artist_name == mb_artist_name: artist_uri = shs_artist['uri'] break elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85: print "%s => similarity = %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name))) artist_uri = shs_artist['uri'] break if artist_uri: matched_artists.add(artist['gid']) colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri)) edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url']) out(' * edit note: %s' % (edit_note,)) mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note) else: colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],)) if artist['processed'] is None and artist['gid'] not in seen_artists: db.execute("INSERT INTO bot_shs_link_artist (artist) VALUES (%s)", (artist['gid'],)) else: db.execute("UPDATE bot_shs_link_artist SET processed = now() WHERE artist = %s", (artist['gid'],)) seen_artists.add(artist['gid'])
# Visit the groups stored in rg_by_ac in random order. The values are copied
# into a real list first so that random.shuffle can reorder them in place
# even when values() does not return a list.
rg_grouped = list(rg_by_ac.values())
random.shuffle(rg_grouped)

# For each release group, link the Discogs master URL when every release in
# the group carries a distinct Discogs link and all of them resolve to the
# same master whose name is close to the release group name.
for i, (rg, gid, name) in enumerate(itertools.chain(*rg_grouped)):
    urls = set(u[0] for u in db.execute(query_rg_release_discogs, rg))
    # A single Discogs link gives nothing to cross-check against.
    if len(urls) < 2:
        continue
    # Report one-based progress so the first item is not shown as 0/N.
    out(u'%d/%d - %.2f%%' % (i + 1, count, (i + 1) * 100.0 / count))
    out(u'%s http://musicbrainz.org/release-group/%s' % (name, gid))
    masters = list(discogs_get_master(urls))
    if not masters:
        out(u' aborting, no Discogs master!')
        continue
    if len(set(masters)) > 1:
        out(u' aborting, releases with different Discogs master in one group!')
        continue
    if len(masters) != len(urls):
        out(u' aborting, releases without Discogs master in group!')
        continue
    master_name, master_id, master_artists = masters[0]
    # Case-insensitive similarity between the master name and the group name.
    ratio = Levenshtein.ratio(master_name.lower(), name.lower())
    if ratio < 0.8:
        out(u' Similarity ratio too small: %.2f' % ratio)
        continue
    master_url = 'http://www.discogs.com/master/%d' % master_id
    text = u'There are %d distinct Discogs links in this release group, and all point to this master URL.\n' % len(urls)
    text += u'The name of the Discogs master is “%s” (similarity: %.0f%%)' % (master_name, ratio * 100)
    text += u' by %s.' % master_artists
    out(u' %s\n %s' % (master_url, text))
    mb.add_url('release_group', gid, 90, master_url, text)