def create_mbid_mapping():
    """
        This function is the heart of the mbid mapping. It calculates an
        intermediate release table and then fetches all the recordings from it,
        so that duplicate recording-artist pairs all resolve to the "canonical"
        release-artist pair, making them suitable for inclusion in the
        msid-mapping.
    """

    log("mbid mapping: start")
    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("mbid mapping: create schema")
            create_schema(mb_conn)
            log("mbid mapping: drop old tables, create new tables")
            create_tables(mb_conn)

            create_temp_release_table(mb_conn)

            with mb_conn.cursor() as mb_curs2:
                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                batch_count = 0
                serial = 1
                log("mbid mapping: fetch recordings")
                mb_curs.execute("""SELECT r.name AS recording_name,
                                          r.gid AS recording_mbid,
                                          ac.name AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          rl.name AS release_name,
                                          rl.gid AS release_mbid,
                                          rpr.id AS score
                                     FROM recording r
                                     JOIN artist_credit ac
                                       ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn
                                       ON ac.id = acn.artist_credit
                                     JOIN track t
                                       ON t.recording = r.id
                                     JOIN medium m
                                       ON m.id = t.medium
                                     JOIN release rl
                                       ON rl.id = m.release
                                     JOIN mapping.tmp_mbid_mapping_releases rpr
                                       ON rl.id = rpr.release
                                LEFT JOIN release_country rc
                                       ON rc.release = rl.id
                                 GROUP BY rpr.id, ac.id, rl.gid, artist_credit_name,
                                          r.gid, r.name, release_name
                                 ORDER BY ac.id, rpr.id""")

                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_mbid_mapping", rows)
                            count += len(rows)
                            mb_conn.commit()
                            rows = []

                            batch_count += 1
                            if batch_count % 200 == 0:
                                log("mbid mapping: inserted %d rows." % count)

                    try:
                        recording_name = row['recording_name']
                        artist_credit_name = row['artist_credit_name']
                        release_name = row['release_name']
                        # Strip non-word characters, lowercase and transliterate
                        # to ASCII to build the fuzzy lookup key.
                        combined_lookup = unidecode(re.sub(r'[^\w]+', '', artist_credit_name + recording_name).lower())
                        if recording_name not in artist_recordings:
                            artist_recordings[recording_name] = (serial,
                                                                 recording_name,
                                                                 row['recording_mbid'],
                                                                 artist_credit_name,
                                                                 row['artist_credit_id'],
                                                                 release_name,
                                                                 row['release_mbid'],
                                                                 combined_lookup,
                                                                 row['score'])
                            serial += 1
                    except TypeError:
                        log(row)
                        raise

                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_mbid_mapping", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("mbid mapping: inserted %d rows total." % count)
            log("mbid mapping: create indexes")
            create_indexes(mb_conn)

            log("mbid mapping: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    log("mbid mapping: done")
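
# The insert_rows() helper used throughout this file is not defined in this
# section. A minimal sketch of what it might look like, assuming it wraps
# psycopg2.extras.execute_values() to expand each batch into a single
# multi-row INSERT (the real helper's signature and any conflict handling
# may differ):
import psycopg2.extras


def insert_rows(curs, table, rows):
    # Hypothetical stand-in for the helper called above. execute_values()
    # substitutes the single %s placeholder with a multi-row VALUES list,
    # inserting the whole batch in one round trip.
    query = "INSERT INTO " + table + " VALUES %s"
    psycopg2.extras.execute_values(curs, query, rows, template=None)
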
def create_year_mapping():
    """
        This function is the heart of the recording artist pair year mapping.
        It calculates an intermediate release table and then fetches all the
        recordings from it, so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pair, making them suitable
        for inclusion in the msid-mapping.
    """

    log("year mapping: start")
    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("year mapping: drop old tables, create new tables")
            create_schema(mb_conn)
            create_tables(mb_conn)

            create_temp_release_table(mb_conn)

            with mb_conn.cursor() as mb_curs2:
                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                log("year mapping: fetch recordings")
                mb_curs.execute("""SELECT lower(musicbrainz.musicbrainz_unaccent(r.name)) AS recording_name,
                                          lower(musicbrainz.musicbrainz_unaccent(ac.name)) AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          date_year AS year
                                     FROM recording r
                                     JOIN artist_credit ac
                                       ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn
                                       ON ac.id = acn.artist_credit
                                     JOIN track t
                                       ON t.recording = r.id
                                     JOIN medium m
                                       ON m.id = t.medium
                                     JOIN release rl
                                       ON rl.id = m.release
                                     JOIN mapping.tmp_year_mapping_release rpr
                                       ON rl.id = rpr.release
                                LEFT JOIN release_country rc
                                       ON rc.release = rl.id
                                 ORDER BY ac.id, rpr.id""")

                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_year_mapping", rows)
                            count += len(rows)
                            mb_conn.commit()
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    if recording_name not in artist_recordings:
                        # Double single quotes so the values survive being
                        # spliced into literal SQL downstream.
                        artist_recordings[recording_name] = (recording_name.replace("'", "''"),
                                                             artist_credit_name.replace("'", "''"),
                                                             row['year'])
                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_year_mapping", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("year mapping: inserted %d rows total." % count)
            create_indexes(mb_conn)

            log("year mapping: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    log("year mapping: done")
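
# Lookups against the finished year mapping must normalize the query string
# the same way the rows above were built: lowercased and unaccented. A
# hypothetical consumer, assuming the temporary table is swapped into
# production as mapping.year_mapping with the column names used above:
def lookup_year(curs, artist_credit_name, recording_name):
    # Apply the same lower()/musicbrainz_unaccent() normalization in SQL
    # that create_year_mapping() used when it built the rows.
    curs.execute("""SELECT year
                      FROM mapping.year_mapping
                     WHERE recording_name = lower(musicbrainz.musicbrainz_unaccent(%s))
                       AND artist_credit_name = lower(musicbrainz.musicbrainz_unaccent(%s))""",
                 (recording_name, artist_credit_name))
    row = curs.fetchone()
    return row[0] if row else None
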
def create_pairs():
    """
        This function is the heart of the recording artist pair mapping. It
        calculates an intermediate release table and then fetches all the
        recordings from it, so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pair, making them suitable
        for inclusion in the msid-mapping.
    """

    stats = {}
    stats["started"] = datetime.datetime.utcnow().isoformat()
    stats["git commit hash"] = subprocess.getoutput("git rev-parse HEAD")

    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("Create pairs: drop old tables, create new tables")
            create_schema(mb_conn)
            create_tables(mb_conn)

            stats = create_temp_release_table(mb_conn, stats)

            mb_curs.execute("SELECT COUNT(*) from musicbrainz.recording")
            stats["mb_recording_count"] = mb_curs.fetchone()[0]

            with mb_conn.cursor() as mb_curs2:
                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                log("Create pairs: fetch recordings")
                mb_curs.execute("""SELECT lower(musicbrainz.musicbrainz_unaccent(r.name)) AS recording_name,
                                          r.id AS recording_id,
                                          lower(musicbrainz.musicbrainz_unaccent(ac.name)) AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          lower(musicbrainz.musicbrainz_unaccent(rl.name)) AS release_name,
                                          rl.id AS release_id,
                                          rpr.id
                                     FROM recording r
                                     JOIN artist_credit ac
                                       ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn
                                       ON ac.id = acn.artist_credit
                                     JOIN track t
                                       ON t.recording = r.id
                                     JOIN medium m
                                       ON m.id = t.medium
                                     JOIN release rl
                                       ON rl.id = m.release
                                     JOIN mapping.tmp_recording_pair_releases rpr
                                       ON rl.id = rpr.release
                                 GROUP BY rpr.id, ac.id, rl.id, artist_credit_name,
                                          r.id, r.name, release_name
                                 ORDER BY ac.id, rpr.id""")

                log("Create pairs: Insert rows into DB.")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows)
                            count += len(rows)
                            mb_conn.commit()
                            log("Create pairs: inserted %d rows." % count)
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    release_name = row['release_name']
                    if config.REMOVE_NON_WORD_CHARS:
                        recording_name = re.sub(r'\W+', '', recording_name)
                    if recording_name not in artist_recordings:
                        if config.REMOVE_NON_WORD_CHARS:
                            artist_credit_name = re.sub(r'\W+', '', artist_credit_name)
                            release_name = re.sub(r'\W+', '', release_name)
                        artist_recordings[recording_name] = (recording_name,
                                                             row['recording_id'],
                                                             artist_credit_name,
                                                             row['artist_credit_id'],
                                                             release_name,
                                                             row['release_id'])
                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("Create pairs: inserted %d rows total." % count)
            stats["recording_artist_pair_count"] = count

            log("Create pairs: create indexes")
            create_indexes(mb_conn)

            log("Create pairs: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    stats["completed"] = datetime.datetime.utcnow().isoformat()
    with psycopg2.connect(config.DB_CONNECT_MB) as conn:
        with conn.cursor() as curs:
            curs.execute("""INSERT INTO mapping.mapping_stats (stats) VALUES (%s)""",
                         (ujson.dumps(stats),))
            conn.commit()

    log("done")
    print()
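
# Each run of create_pairs() appends one JSON blob to mapping.mapping_stats.
# A hypothetical way to read back the most recent run; the serial id column
# used for ordering is an assumption about the table's schema, and psycopg2
# decodes a json/jsonb column into a Python dict automatically:
def latest_mapping_stats(conn):
    # Hypothetical reader for the stats rows inserted above.
    with conn.cursor() as curs:
        curs.execute("""SELECT stats
                          FROM mapping.mapping_stats
                      ORDER BY id DESC
                         LIMIT 1""")
        row = curs.fetchone()
        return row[0] if row else None
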
def create_pairs():

    stats = {}
    stats["started"] = datetime.datetime.utcnow().isoformat()
    stats["git commit hash"] = subprocess.getoutput("git rev-parse HEAD")

    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            print("Drop/create pairs table")
            create_schema(mb_conn)
            create_tables(mb_conn)

            print("select releases from MB")
            stats = create_temp_release_table(mb_conn, stats)

            mb_curs.execute("SELECT COUNT(*) from musicbrainz.recording")
            stats["mb_recording_count"] = mb_curs.fetchone()[0]

            with mb_conn.cursor() as mb_curs2:
                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                print("Run fetch recordings query")
                mb_curs.execute(SELECT_RECORDING_PAIRS_QUERY)

                print("Fetch recordings and insert")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.recording_artist_credit_pairs", rows)
                            count += len(rows)
                            mb_conn.commit()
                            print("inserted %d rows." % count)
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    release_name = row['release_name']
                    if config.REMOVE_NON_WORD_CHARS:
                        recording_name = re.sub(r'\W+', '', recording_name)
                    if recording_name not in artist_recordings:
                        if config.REMOVE_NON_WORD_CHARS:
                            artist_credit_name = re.sub(r'\W+', '', artist_credit_name)
                            release_name = re.sub(r'\W+', '', release_name)
                        artist_recordings[recording_name] = (recording_name,
                                                             row['recording_id'],
                                                             artist_credit_name,
                                                             row['artist_credit_id'],
                                                             release_name,
                                                             row['release_id'])
                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.recording_artist_credit_pairs", rows)
                    mb_conn.commit()
                    count += len(rows)

            print("inserted %d rows total." % count)
            stats["recording_artist_pair_count"] = count

            print("Create indexes")
            create_indexes(mb_conn)

    stats["completed"] = datetime.datetime.utcnow().isoformat()
    with open("stats/recording-pairs-stats.json", "w") as f:
        f.write(ujson.dumps(stats, indent=2) + "\n")

    print("done")
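
# All four functions above share the same streaming pattern: rows arrive
# ordered by artist_credit_id, only the first row seen for each
# recording_name within an artist credit survives, and completed groups are
# flushed once the buffer passes BATCH_SIZE. A standalone distillation of
# that loop (illustrative names, not part of the repo):
def dedup_batches(rows, batch_size):
    # rows must be sorted by artist_credit_id. Within each artist credit
    # only the first occurrence of a recording_name is kept; finished
    # artist-credit groups are yielded in batches of > batch_size rows.
    batch = []
    last_ac_id = None
    artist_recordings = {}
    for row in rows:
        if last_ac_id is not None and row["artist_credit_id"] != last_ac_id:
            batch.extend(artist_recordings.values())
            artist_recordings = {}
            if len(batch) > batch_size:
                yield batch
                batch = []
        artist_recordings.setdefault(row["recording_name"], row)
        last_ac_id = row["artist_credit_id"]
    batch.extend(artist_recordings.values())
    if batch:
        yield batch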