def create_tables(mb_conn): """ Create tables needed to create the year mapping. First is the temp table that the results will be stored in (in order to not conflict with the production version of this table). Second its format sort table to enables us to sort releases according to preferred format, release date and type. """ # drop/create finished table try: with mb_conn.cursor() as curs: curs.execute("DROP TABLE IF EXISTS mapping.tmp_year_mapping") curs.execute("""CREATE TABLE mapping.tmp_year_mapping ( recording_name TEXT NOT NULL, artist_credit_name TEXT NOT NULL, year INTEGER)""") curs.execute("DROP TABLE IF EXISTS mapping.tmp_year_mapping_release") curs.execute("""CREATE TABLE mapping.tmp_year_mapping_release ( id SERIAL, release INTEGER)""") create_formats_table(mb_conn) mb_conn.commit() except (OperationalError, UndefinedTable) as err: log("year mapping: failed to create recording pair year tables", err) mb_conn.rollback() raise
def create_tables(mb_conn): """ Create tables needed to create the recording artist pairs. First is the temp table that the results will be stored in (in order to not conflict with the production version of this table). Second its format sort table to enables us to sort releases according to preferred format, release date and type. Finally a stats table is created if it doesn't exist. """ # drop/create finished table try: with mb_conn.cursor() as curs: curs.execute("DROP TABLE IF EXISTS mapping.tmp_recording_artist_credit_pairs") curs.execute("""CREATE TABLE mapping.tmp_recording_artist_credit_pairs ( recording_name TEXT NOT NULL, recording_id INTEGER NOT NULL, artist_credit_name TEXT NOT NULL, artist_credit_id INTEGER NOT NULL, release_name TEXT NOT NULL, release_id INTEGER NOT NULL)""") curs.execute("DROP TABLE IF EXISTS mapping.tmp_recording_pair_releases") curs.execute("""CREATE TABLE mapping.tmp_recording_pair_releases ( id SERIAL, release INTEGER)""") create_formats_table(mb_conn) create_stats_table(curs) mb_conn.commit() except (psycopg2.errors.OperationalError, psycopg2.errors.UndefinedTable) as err: log("failed to create recording pair tables", err) mb_conn.rollback() raise
def create_indexes(conn): """ Create indexes for the mapping """ try: with conn.cursor() as curs: curs.execute( """CREATE INDEX tmp_mbid_mapping_idx_artist_credit_recording_name ON mapping.tmp_mbid_mapping(artist_credit_name, recording_name)""" ) # Remove any duplicate rows so we can create a unique index and not get dups in the results curs.execute("""DELETE FROM mapping.tmp_mbid_mapping WHERE id IN ( SELECT id FROM ( SELECT id, combined_lookup, score, row_number() OVER (PARTITION BY combined_lookup ORDER BY score) FROM mapping.tmp_mbid_mapping GROUP BY combined_lookup, score, id ) AS q WHERE row_number > 1)""") curs.execute( """CREATE UNIQUE INDEX tmp_mbid_mapping_idx_combined_lookup ON mapping.tmp_mbid_mapping(combined_lookup)""" ) conn.commit() except OperationalError as err: log("mbid mapping: failed to mbid mapping", err) conn.rollback() raise
def swap_table_and_indexes(conn): """ Swap temp tables and indexes for production tables and indexes. """ try: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs: curs.execute("DROP TABLE IF EXISTS mapping.mbid_mapping_releases") curs.execute("DROP TABLE IF EXISTS mapping.mbid_mapping") curs.execute("""ALTER TABLE mapping.tmp_mbid_mapping RENAME TO mbid_mapping""") curs.execute("""ALTER TABLE mapping.tmp_mbid_mapping_releases RENAME TO mbid_mapping_releases""") curs.execute( """ALTER INDEX mapping.tmp_mbid_mapping_idx_artist_credit_recording_name RENAME TO mbid_mapping_idx_artist_credit_recording_name""" ) curs.execute( """ALTER INDEX mapping.tmp_mbid_mapping_idx_combined_lookup RENAME TO mbid_mapping_idx_combined_lookup""") curs.execute( """ALTER INDEX mapping.tmp_mbid_mapping_releases_idx_release RENAME TO mbid_mapping_releases_idx_release""") curs.execute( """ALTER INDEX mapping.tmp_mbid_mapping_releases_idx_id RENAME TO mbid_mapping_releases_idx_id""") conn.commit() except OperationalError as err: log("mbid mapping: failed to swap in new mbid mapping tables", str(err)) conn.rollback() raise
def create_tables(mb_conn): """ Create tables needed to create the recording artist pairs. First is the temp table that the results will be stored in (in order to not conflict with the production version of this table). Second its format sort table to enables us to sort releases according to preferred format, release date and type. """ # drop/create finished table try: with mb_conn.cursor() as curs: curs.execute("DROP TABLE IF EXISTS mapping.tmp_mbid_mapping") curs.execute("""CREATE TABLE mapping.tmp_mbid_mapping ( id SERIAL, artist_credit_id INT NOT NULL, artist_mbids UUID[] NOT NULL, artist_credit_name TEXT NOT NULL, release_mbid UUID NOT NULL, release_name TEXT NOT NULL, recording_mbid UUID NOT NULL, recording_name TEXT NOT NULL, combined_lookup TEXT NOT NULL, score INTEGER NOT NULL)""") curs.execute( "DROP TABLE IF EXISTS mapping.tmp_mbid_mapping_releases") curs.execute("""CREATE TABLE mapping.tmp_mbid_mapping_releases ( id SERIAL, release INTEGER)""") create_formats_table(mb_conn) mb_conn.commit() except (psycopg2.errors.OperationalError, psycopg2.errors.UndefinedTable) as err: log("mbid mapping: failed to mbid mapping tables", err) mb_conn.rollback() raise
def sync_release_color_table(): """ Top level function to sync the two CAA and LB cover art tables by fetching all rows sorted by caa_id and adding or removing cover art as needed. """ log("cover art sync starting...") mb_query = """SELECT caa.id AS caa_id , release AS release_id , release.gid AS release_mbid , mime_type FROM cover_art_archive.cover_art caa JOIN cover_art_archive.cover_art_type cat ON cat.id = caa.id JOIN musicbrainz.release ON caa.release = release.id WHERE type_id = 1 AND caa.id > %s ORDER BY caa.id LIMIT %s""" lb_query = """ SELECT caa_id FROM release_color WHERE caa_id > %s ORDER BY caa_id LIMIT %s""" compare_coverart(mb_query, lb_query, 0, 0, "caa_id", "caa_id")
def create_table(conn): """ Given a valid postgres connection, create temporary tables to generate the mapping into. These temporary tables will later be swapped out with the producton tables in a single transaction. """ try: with conn.cursor() as curs: curs.execute("DROP TABLE IF EXISTS mapping.tmp_msid_mbid_mapping") curs.execute("""CREATE TABLE mapping.tmp_msid_mbid_mapping ( count INTEGER, msb_artist_name TEXT, msb_artist_msid UUID, msb_recording_name TEXT, msb_recording_msid UUID, msb_release_name TEXT, msb_release_msid UUID, mb_artist_name TEXT, mb_artist_credit_id INTEGER, mb_recording_name TEXT, mb_recording_id INTEGER, mb_release_name TEXT, mb_release_id INTEGER, source TEXT)""") create_stats_table(curs) conn.commit() except DuplicateTable as err: log("Cannot drop/create tables: ", str(err)) conn.rollback() raise
def swap_table_and_indexes(conn): """ Swap temp tables and indexes for production tables and indexes. """ try: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs: curs.execute("DROP TABLE mapping.recording_pair_releases") curs.execute("DROP TABLE mapping.recording_artist_credit_pairs") curs.execute("""ALTER TABLE mapping.tmp_recording_artist_credit_pairs RENAME TO recording_artist_credit_pairs""") curs.execute("""ALTER TABLE mapping.tmp_recording_pair_releases RENAME TO recording_pair_releases""") curs.execute("""ALTER INDEX mapping.tmp_recording_artist_credit_pairs_idx_artist_credit_name RENAME TO recording_artist_credit_pairs_idx_artist_credit_name""") curs.execute("""ALTER INDEX mapping.tmp_recording_pair_releases_idx_release RENAME TO recording_pair_releases_idx_release""") curs.execute("""ALTER INDEX mapping.tmp_recording_pair_releases_idx_id RENAME TO recording_pair_releases_idx_id""") conn.commit() except OperationalError as err: log("failed to swap in new recording pair tables", str(err)) conn.rollback() raise
def cron_log(): """ Print the internal cron log file for debugging purposes. """ if os.path.exists(CRON_LOG_FILE): log("Current cron job log file:") subprocess.run(["cat", CRON_LOG_FILE]) else: log("Log file is empty")
def build(client, collection_name):

    schema = {
        'name': collection_name,
        'fields': [
            {
                'name': 'combined',
                'type': 'string'
            },
            {
                'name': 'score',
                'type': 'int32'
            },
        ],
        'default_sorting_field': 'score'
    }

    client.collections.create(schema)

    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:
            curs.execute("SELECT max(score) FROM mapping.mbid_mapping")
            max_score = curs.fetchone()[0]

            query = """SELECT recording_name,
                              recording_mbid,
                              release_name,
                              release_mbid,
                              artist_credit_id,
                              artist_credit_name,
                              artist_mbids,
                              score
                         FROM mapping.mbid_mapping"""

            curs.execute(query)
            documents = []
            for i, row in enumerate(curs):
                document = dict(row)
                document['artist_mbids'] = row["artist_mbids"][1:-1]
                document['score'] = max_score - document['score']
                document['combined'] = prepare_string(document['recording_name'] + " " + document['artist_credit_name'])
                documents.append(document)

                if len(documents) == BATCH_SIZE:
                    client.collections[collection_name].documents.import_(documents)
                    documents = []

                if i and i % 1000000 == 0:
                    log("typesense index: Indexed %d rows" % i)

            if documents:
                client.collections[collection_name].documents.import_(documents)

    log("typesense index: indexing complete. waiting for background tasks to finish.")
    time.sleep(5)

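# NOTE: `prepare_string` is used above to build the `combined` search field but is not
# defined in this section. A plausible sketch, assuming it mirrors the combined_lookup
# normalisation used in create_mbid_mapping() below (unaccent, strip punctuation,
# lowercase) while keeping spaces between tokens; the real helper may differ.
import re
from unidecode import unidecode

def prepare_string(text):
    """ Minimal sketch: normalise a string for indexing and lookups. """
    return unidecode(re.sub(r'[^\w ]+', '', text).lower())
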
def create_indexes(conn): """ Create indexes for the year mapping """ try: with conn.cursor() as curs: curs.execute("""CREATE INDEX tmp_year_mapping_idx_ac_rec_year ON mapping.tmp_year_mapping(artist_credit_name, recording_name)""") conn.commit() except OperationalError as err: log("year mapping: failed to create recording pair index", err) conn.rollback() raise
def create_indexes(conn): """ Create indexes for the recording artist pairs """ try: with conn.cursor() as curs: curs.execute("""CREATE INDEX tmp_recording_artist_credit_pairs_idx_artist_credit_name ON mapping.tmp_recording_artist_credit_pairs(artist_credit_name)""") conn.commit() except OperationalError as err: log("failed to create recording pair index", err) conn.rollback() raise
def create_indexes(mb_conn): """ Create the user_name index on the tracks of the year table. """ try: with mb_conn.cursor() as curs: curs.execute("""CREATE INDEX tracks_of_the_year_ndx_user_name ON mapping.tracks_of_the_year (user_name)""" ) mb_conn.commit() except (psycopg2.errors.OperationalError, psycopg2.errors.UndefinedTable) as err: log("mbid mapping: failed to create tracks of the year indexes", err) mb_conn.rollback() raise
def load_MB_recordings(): """ Load all of the recording artists pairs into ram and sort them for matching. """ mb_recordings = [] with psycopg2.connect(config.DB_CONNECT_MB) as conn: with conn.cursor() as curs: query = """SELECT DISTINCT lower(public.unaccent(artist_credit_name::TEXT)) as artist_credit_name, artist_credit_id, lower(public.unaccent(recording_name::TEXT)) AS recording_name, recording_id, lower(public.unaccent(release_name::TEXT)) AS release_name, release_id FROM mapping.recording_artist_credit_pairs""" if config.USE_MINIMAL_DATASET: query += " WHERE artist_credit_id = 1160983" curs.execute(query) while True: mb_row = curs.fetchone() if not mb_row: break artist = mb_row[0] artist_credit_id = int(mb_row[1]) recording = mb_row[2] recording_id = int(mb_row[3]) release = mb_row[4] release_id = int(mb_row[5]) if config.REMOVE_NON_WORD_CHARS: artist = re.sub(r'\W+', '', artist) recording = re.sub(r'\W+', '', recording) release = re.sub(r'\W+', '', release) mb_recordings.append({ "artist_name": artist, "artist_credit_id": artist_credit_id, "recording_name": recording, "recording_id": recording_id, "release_name": release, "release_id": release_id, }) log("loaded %d MB recordings, now sorting" % len(mb_recordings)) mb_recording_index = list(range(len(mb_recordings))) mb_recording_index = sorted(mb_recording_index, key=lambda rec: (mb_recordings[rec]["artist_name"], mb_recordings[rec]["recording_name"])) return (mb_recordings, mb_recording_index)
def fetch_tracks_listened_to(lb_conn, mb_conn, start_ts, end_ts): """ Actually fetch the top discoveries for the given year and set of users """ with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs: with mb_conn.cursor( cursor_factory=psycopg2.extras.DictCursor) as mb_curs: log("create tracks listened table") create_table(mb_conn) log("fetch tracks listened to") # Fetch the basic data for all tracks that users listened to in a given year. query = """SELECT user_name , m.recording_mbid , md.recording_name , md.artist_credit_name , md.artist_mbids , count(*) AS listen_count FROM listen l JOIN mbid_mapping m ON data->'track_metadata'->'additional_info'->>'recording_msid' = m.recording_msid::TEXT JOIN mbid_mapping_metadata md ON m.recording_mbid = md.recording_mbid WHERE listened_at >= %s AND listened_at < %s AND m.recording_mbid is not null GROUP BY m.recording_mbid, md.recording_name, md.artist_credit_name, md.artist_mbids, user_name""" % ( start_ts, end_ts) to_insert = [] lb_curs.execute(query) while True: row = lb_curs.fetchone() if not row: break to_insert.append(row) if len(to_insert) >= BATCH_SIZE: insert_rows(mb_curs, "mapping.tracks_of_the_year", to_insert) to_insert = [] mb_conn.commit() insert_rows(mb_curs, "mapping.tracks_of_the_year", to_insert) mb_conn.commit()
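# NOTE: `insert_rows` is used by several functions in this section but is not defined
# here. A minimal sketch, assuming it wraps psycopg2.extras.execute_values to bulk
# insert a list of row tuples into the given table; the real helper may also accept an
# explicit column list or template.
from psycopg2.extras import execute_values

def insert_rows(curs, table, values):
    """ Minimal sketch: bulk insert a list of row tuples into `table`. """
    if not values:
        return
    execute_values(curs, "INSERT INTO " + table + " VALUES %s", values)
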
def fetch_top_discoveries_for_users(lb_conn, mb_conn, year): """ Actually fetch the top discoveries for the given year""" with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs: with mb_conn.cursor( cursor_factory=psycopg2.extras.DictCursor) as mb_curs: log("crate top_listens table") create_table(mb_conn) log("fetch active users") user_list = fetch_user_list(lb_conn, year) log("Process %d users." % len(user_list)) # This query will select all listens/data for a given user list and create an array of the years when # a track was listened to. This data directly is not useful for anything directly, but from this # a few types of playlists can be generated. query = """SELECT user_name , track_name , data->'track_metadata'->>'artist_name' AS artist_name , array_agg(extract(year from to_timestamp(listened_at))::INT ORDER BY extract(year from to_timestamp(listened_at))::INT) AS years , m.recording_mbid , mm.artist_mbids FROM listen FULL OUTER JOIN mbid_mapping m ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = m.recording_msid FULL OUTER JOIN mbid_mapping_metadata mm ON mm.recording_mbid = m.recording_mbid WHERE user_name in %s AND mm.recording_mbid IS NOT NULL GROUP BY user_name, artist_name, mm.artist_mbids, track_name, m.recording_mbid HAVING (array_agg(extract(year from to_timestamp(listened_at))::INT ORDER BY extract(year from to_timestamp(listened_at))::INT))[1] = %s ORDER BY user_name, array_length(array_agg(extract(year from to_timestamp(listened_at))::INT), 1) DESC""" for users in chunks(user_list, USERS_PER_BATCH): log(users) lb_curs.execute(query, (tuple(users), year)) top_recordings = [] while True: row = lb_curs.fetchone() if not row: break if len(row["years"]) > 2: top_recordings.append( (row["recording_mbid"], row["track_name"], row["artist_name"], row["artist_mbids"], len(row["years"]), row["user_name"])) print("insert %d rows" % len(top_recordings)) insert_rows(mb_curs, "mapping.top_discoveries", top_recordings) mb_conn.commit()
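# NOTE: `chunks` is used above to batch the user list but is not shown in this section.
# A minimal sketch of such a helper, assuming it yields successive fixed-size slices of
# a list; the real helper may be implemented differently.
def chunks(lst, size):
    """ Minimal sketch: yield successive `size`-length slices of `lst`. """
    for i in range(0, len(lst), size):
        yield lst[i:i + size]
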
def incremental_update_release_color_table(): """ Incrementally update the cover art mapping. This is designed to run hourly and save a last_updated timestamp in the cache. If the cache value cannot be found, a complete sync is run instead and the cache value is set. """ cache.init(host=config.REDIS_HOST, port=config.REDIS_PORT, namespace=config.REDIS_NAMESPACE) try: last_updated = cache.get(LAST_UPDATED_CACHE_KEY, decode=True) or None except Exception: last_updated = None if not last_updated: log("No timestamp found, performing full sync") sync_release_color_table() last_updated = get_last_updated_from_caa() cache.set(LAST_UPDATED_CACHE_KEY, last_updated, expirein=0, encode=True) return log("cover art incremental update starting...") mb_query = """SELECT caa.id AS caa_id , release AS release_id , release.gid AS release_mbid , mime_type , date_uploaded FROM cover_art_archive.cover_art caa JOIN cover_art_archive.cover_art_type cat ON cat.id = caa.id JOIN musicbrainz.release ON caa.release = release.id WHERE type_id = 1 AND caa.date_uploaded > %s ORDER BY caa.date_uploaded LIMIT %s""" compare_coverart(mb_query, None, last_updated, None, "date_uploaded", "last_updated") last_updated = get_last_updated_from_caa() cache.set(LAST_UPDATED_CACHE_KEY, last_updated, expirein=0, encode=True)
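# NOTE: `get_last_updated_from_caa` is called above but not shown in this section. A
# minimal sketch, assuming it simply reads the newest upload timestamp from the CAA
# cover_art table (the same date_uploaded column the incremental query orders by); the
# real helper may apply additional filters.
def get_last_updated_from_caa():
    """ Minimal sketch: return the most recent cover art upload timestamp. """
    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as conn:
        with conn.cursor() as curs:
            curs.execute("""SELECT max(date_uploaded)
                              FROM cover_art_archive.cover_art""")
            return curs.fetchone()[0]
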
def swap_table_and_indexes(conn): """ This function swaps the temporary files that the mapping was written for the production tables, inside a single transaction. This should isolate the end users from ever seeing any down time in mapping availability. """ try: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs: curs.execute("DROP TABLE mapping.msid_mbid_mapping") curs.execute("""ALTER TABLE mapping.tmp_msid_mbid_mapping RENAME TO msid_mbid_mapping""") curs.execute( """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_recording_name RENAME TO msid_mbid_mapping_idx_msb_recording_name""" ) curs.execute( """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_recording_msid RENAME TO msid_mbid_mapping_idx_msb_recording_msid""" ) curs.execute( """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_artist_name RENAME TO msid_mbid_mapping_idx_msb_artist_name""" ) curs.execute( """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_artist_msid RENAME TO msid_mbid_mapping_idx_msb_artist_msid""" ) curs.execute( """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_release_name RENAME TO msid_mbid_mapping_idx_msb_release_name""" ) curs.execute( """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_release_msid RENAME TO msid_mbid_mapping_idx_msb_release_msid""" ) conn.commit() except OperationalError as err: log("failed to swap in new mapping table", str(err)) conn.rollback() raise
def create_table(mb_conn): """ Create the tracks of the year table in the mapping schema of a docker-musicbrinz instance. """ try: with mb_conn.cursor() as curs: curs.execute("DROP TABLE IF EXISTS mapping.tracks_of_the_year") curs.execute("""CREATE TABLE mapping.tracks_of_the_year ( user_name TEXT NOT NULL , recording_mbid UUID NOT NULL , recording_name TEXT NOT NULL , artist_credit_name TEXT NOT NULL , artist_mbids UUID[] NOT NULL , listen_count INTEGER NOT NULL )""") mb_conn.commit() except (psycopg2.errors.OperationalError, psycopg2.errors.UndefinedTable) as err: log("mbid mapping: failed to create tracks of the year table", err) mb_conn.rollback() raise
def swap_table_and_indexes(conn): """ Swap temp tables and indexes for production tables and indexes. """ try: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs: curs.execute("DROP TABLE IF EXISTS mapping.year_mapping") curs.execute("DROP TABLE IF EXISTS mapping.year_mapping_release") curs.execute("""ALTER TABLE mapping.tmp_year_mapping_release RENAME TO year_mapping_release""") curs.execute("""ALTER TABLE mapping.tmp_year_mapping RENAME TO year_mapping""") curs.execute("""ALTER INDEX mapping.tmp_year_mapping_idx_ac_rec_year RENAME TO year_mapping_idx_ac_rec_year""") conn.commit() except OperationalError as err: log("year mapping: failed to swap in new recording pair tables", str(err)) conn.rollback() raise
def create_indexes(conn): """ Create the indexes on the mapping. """ try: with conn.cursor() as curs: curs.execute( """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_recording_name ON mapping.tmp_msid_mbid_mapping(msb_recording_name)""" ) curs.execute( """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_recording_msid ON mapping.tmp_msid_mbid_mapping(msb_recording_msid)""" ) curs.execute( """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_artist_name ON mapping.tmp_msid_mbid_mapping(msb_artist_name)""" ) curs.execute( """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_artist_msid ON mapping.tmp_msid_mbid_mapping(msb_artist_msid)""" ) curs.execute( """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_release_name ON mapping.tmp_msid_mbid_mapping(msb_release_name)""" ) curs.execute( """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_release_msid ON mapping.tmp_msid_mbid_mapping(msb_release_msid)""" ) conn.commit() except OperationalError as err: conn.rollback() log("creating indexes failed.") raise
def create_temp_release_table(conn, stats): """ Creates an intermediate table that orders releases by types, format, releases date, country and artist_credit. This sorting should in theory sort the most desired releases (albums, digital releases, first released) over the other types in order to match to the "canonical" releases and to also ensure that tracks that came from one release will be matched to the same release and will not end up being scattered across many releases from the same artist. """ with conn.cursor() as curs: log("Create temp release table: select") query = """INSERT INTO mapping.tmp_recording_pair_releases (release) SELECT r.id FROM musicbrainz.release_group rg JOIN musicbrainz.release r ON rg.id = r.release_group JOIN musicbrainz.release_country rc ON rc.release = r.id JOIN musicbrainz.medium m ON m.release = r.id JOIN musicbrainz.medium_format mf ON m.format = mf.id JOIN mapping.format_sort fs ON mf.id = fs.format JOIN musicbrainz.artist_credit ac ON rg.artist_credit = ac.id JOIN musicbrainz.release_group_primary_type rgpt ON rg.type = rgpt.id FULL OUTER JOIN musicbrainz.release_group_secondary_type_join rgstj ON rg.id = rgstj.release_group FULL OUTER JOIN musicbrainz.release_group_secondary_type rgst ON rgstj.secondary_type = rgst.id WHERE rg.artist_credit != 1 %s ORDER BY rg.type, rgst.id desc, fs.sort, to_date(date_year::TEXT || '-' || COALESCE(date_month,12)::TEXT || '-' || COALESCE(date_day,28)::TEXT, 'YYYY-MM-DD'), country, rg.artist_credit, rg.name""" if config.USE_MINIMAL_DATASET: log("Create temp release table: Using a minimal dataset!") curs.execute(query % ('AND rg.artist_credit = %d' % TEST_ARTIST_ID)) else: curs.execute(query % "") log("Create temp release table: create indexes") curs.execute("""CREATE INDEX tmp_recording_pair_releases_idx_release ON mapping.tmp_recording_pair_releases(release)""") curs.execute("""CREATE INDEX tmp_recording_pair_releases_idx_id ON mapping.tmp_recording_pair_releases(id)""") curs.execute("SELECT COUNT(*) from mapping.tmp_recording_pair_releases") stats["recording_pair_release_count"] = curs.fetchone()[0] curs.execute("SELECT COUNT(*) from musicbrainz.release") stats["mb_release_count"] = curs.fetchone()[0] return stats
def process_row(row): """ Process one CAA query row, by fetching the 250px thumbnail, process the color, then import into the DB """ sleep_duation = 2 while True: headers = { 'User-Agent': 'ListenBrainz HueSound Color Bot ( [email protected] )'} release_mbid, caa_id = row["release_mbid"], row["caa_id"] url = f"https://archive.org/download/mbid-{release_mbid}/mbid-{release_mbid}-{caa_id}_thumb250.jpg" r = requests.get(url, headers=headers) if r.status_code == 200: filename = "/tmp/release-colors-%s.img" % get_ident() with open(filename, 'wb') as f: for chunk in r: f.write(chunk) try: red, green, blue = process_image(filename, row["mime_type"]) insert_row(row["release_mbid"], red, green, blue, row["caa_id"]) log("%s %s: (%s, %s, %s)" % (row["caa_id"], row["release_mbid"], red, green, blue)) except Exception as err: log("Could not process %s" % url) log(err) os.unlink(filename) sleep_duation = 2 break if r.status_code == 403: break if r.status_code == 404: break if r.status_code == 429: log("Exceeded rate limit. sleeping %d seconds." % sleep_duration) sleep(sleep_duration) sleep_duration *= 2 if sleep_duration > 100: return continue if r.status_code == 503: log("Service not available. sleeping %d seconds." % sleep_duration) sleep(sleep_duration) sleep_duration *= 2 if sleep_duration > 100: return continue log("Unhandled %d" % r.status_code) break
def compare_coverart(mb_query, lb_query, mb_caa_index, lb_caa_index, mb_compare_key, lb_compare_key):
    """
        The core cover art comparison function. Given two sets of queries, index values,
        and comparison keys this function can perform a complete sync as well as an
        incremental update. The queries must fetch chunks of data from the MB and LB
        tables ordered by the corresponding compare key. The starting indexes (the
        current comparison index into the data) must be provided and match the type of
        the comparison keys.
    """

    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:
            with psycopg2.connect(config.SQLALCHEMY_DATABASE_URI) as lb_conn:
                with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs:

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    threads = []
                    mb_row = None
                    lb_row = None

                    mb_rows = []
                    lb_rows = []

                    mb_done = False
                    lb_done = True if lb_query is None else False

                    extra = 0
                    missing = 0
                    processed = 0

                    while True:
                        if len(mb_rows) == 0 and not mb_done:
                            mb_curs.execute(mb_query, (mb_caa_index, SYNC_BATCH_SIZE))
                            mb_rows = mb_curs.fetchall()
                            if len(mb_rows) == 0:
                                mb_done = True

                        if len(lb_rows) == 0 and not lb_done:
                            lb_curs.execute(lb_query, (lb_caa_index, SYNC_BATCH_SIZE))
                            lb_rows = lb_curs.fetchall()
                            if len(lb_rows) == 0:
                                lb_done = True

                        if not mb_row and len(mb_rows) > 0:
                            mb_row = mb_rows.pop(0)

                        if not lb_row and len(lb_rows) > 0:
                            lb_row = lb_rows.pop(0)

                        if not lb_row and not mb_row:
                            break

                        processed += 1
                        if processed % 100000 == 0:
                            log("processed %d of %d: missing %d extra %d" % (processed, mb_count, missing, extra))

                        # If the item is in MB, but not in LB, add to LB
                        if mb_row is not None and (lb_row is None or mb_row[mb_compare_key] < lb_row[lb_compare_key]):
                            process_cover_art(threads, mb_row)
                            missing += 1
                            mb_caa_index = mb_row[mb_compare_key]
                            mb_row = None
                            continue

                        # If the item is in LB, but not in MB, remove from LB
                        if mb_row is None or mb_row[mb_compare_key] > lb_row[lb_compare_key]:
                            extra += 1
                            delete_from_lb(lb_row[lb_compare_key])
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            continue

                        # If the caa_id is present in both, skip both
                        if mb_row[mb_compare_key] == lb_row[lb_compare_key]:
                            mb_caa_index = mb_row[mb_compare_key]
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            mb_row = None
                            continue

                        assert False

                    join_threads(threads)
                    log("Finished! added/skipped %d removed %d from release_color" % (missing, extra))

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    metrics.init("listenbrainz")
                    metrics.set("listenbrainz-caa-mapper",
                                caa_front_count=mb_count, lb_caa_count=lb_count)

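# NOTE: `get_cover_art_counts`, `process_cover_art`, `delete_from_lb` and `join_threads`
# are referenced by compare_coverart() but not shown in this section. As one example, a
# minimal sketch of what get_cover_art_counts might look like, assuming it counts front
# cover art rows on the MB side and release_color rows on the LB side; the real queries
# may differ.
def get_cover_art_counts(mb_curs, lb_curs):
    """ Minimal sketch: return (mb_count, lb_count) for the two cover art tables. """
    mb_curs.execute("""SELECT COUNT(*)
                         FROM cover_art_archive.cover_art caa
                         JOIN cover_art_archive.cover_art_type cat
                           ON cat.id = caa.id
                        WHERE type_id = 1""")
    mb_count = mb_curs.fetchone()[0]
    lb_curs.execute("SELECT COUNT(*) FROM release_color")
    lb_count = lb_curs.fetchone()[0]
    return mb_count, lb_count
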
def build(client, collection_name):

    schema = {
        'name': collection_name,
        'fields': [
            {
                'name': 'combined',
                'type': 'string'
            },
            {
                'name': 'score',
                'type': 'int32'
            },
        ],
        'default_sorting_field': 'score'
    }

    client.collections.create(schema)

    with psycopg2.connect(config.DB_CONNECT_MB) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:
            curs.execute("SELECT max(score) FROM mapping.mbid_mapping")
            max_score = curs.fetchone()[0]

            query = """SELECT recording_name AS recording_name,
                              r.gid AS recording_mbid,
                              release_name AS release_name,
                              rl.gid AS release_mbid,
                              artist_credit_name AS artist_credit_name,
                              artist_credit_id,
                              score
                         FROM mapping.mbid_mapping
                         JOIN recording r
                           ON r.id = recording_id
                         JOIN release rl
                           ON rl.id = release_id"""
            if config.USE_MINIMAL_DATASET:
                query += " WHERE artist_credit_id = 1160983"

            curs.execute(query)
            documents = []
            for i, row in enumerate(curs):
                document = dict(row)
                document['score'] = max_score - document['score']
                document['combined'] = prepare_string(document['recording_name'] + " " + document['artist_credit_name'])
                documents.append(document)

                if len(documents) == BATCH_SIZE:
                    client.collections[collection_name].documents.import_(documents)
                    documents = []

                if i and i % 1000000 == 0:
                    log("typesense index: Indexed %d rows" % i)

            if documents:
                client.collections[collection_name].documents.import_(documents)

    log("typesense index: indexing complete. waiting for background tasks to finish.")
    time.sleep(5)

def build_index():

    client = typesense.Client({
        'nodes': [{
            'host': config.TYPESENSE_HOST,
            'port': config.TYPESENSE_PORT,
            'protocol': 'http',
        }],
        'api_key': config.TYPESENSE_API_KEY,
        'connection_timeout_seconds': 1000000
    })

    collection_name = COLLECTION_NAME_PREFIX + datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    try:
        log("typesense index: build index '%s'" % collection_name)
        build(client, collection_name)
    except typesense.exceptions.TypesenseClientError as err:
        log("typesense index: Cannot build index: ", str(err))
        return -1

    try:
        latest = COLLECTION_NAME_PREFIX + "latest"
        log("typesense index: alias index '%s' to %s" % (collection_name, latest))
        aliased_collection = {"collection_name": collection_name}
        client.aliases.upsert(latest, aliased_collection)
    except typesense.exceptions.TypesenseClientError as err:
        log("typesense index: Cannot alias index: ", str(err))
        return -2

    try:
        for collection in client.collections.retrieve():
            if collection["name"] == collection_name:
                continue

            if collection["name"].startswith(COLLECTION_NAME_PREFIX):
                log("typesense index: delete collection '%s'" % collection["name"])
                client.collections[collection["name"]].delete()
            else:
                log("typesense index: ignore collection '%s'" % collection["name"])
    except typesense.exceptions.ObjectNotFound as err:
        log("typesense index: Failed to delete collection: ", str(err))

    return 0

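# For context, a hedged example of how the aliased collection might be queried once
# build_index() has run. The alias name (COLLECTION_NAME_PREFIX + "latest") comes from
# build_index() above, but the search parameters and this helper itself are assumptions
# based on the schema defined in build(); the lookup code used elsewhere in the project
# may differ.
def example_search(client, query_string):
    """ Minimal sketch: look up a normalised artist+recording string in the index. """
    return client.collections[COLLECTION_NAME_PREFIX + "latest"].documents.search({
        "q": prepare_string(query_string),
        "query_by": "combined",      # the single text field defined in build()
        "sort_by": "score:desc"      # scores were inverted at index time, so higher is better
    })
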
def create_temp_release_table(conn): """ Creates an intermediate table that orders releases by types, format, releases date, country and artist_credit. This sorting should in theory sort the most desired releases (albums, digital releases, first released) over the other types in order to match to the "canonical" releases and to also ensure that tracks that came from one release will be matched to the same release and will not end up being scattered across many releases from the same artist. """ with conn.cursor() as curs: log("mbid mapping temp tables: Create temp release table: select") query = """ SELECT r.id AS release FROM musicbrainz.release_group rg JOIN musicbrainz.release r ON rg.id = r.release_group LEFT JOIN musicbrainz.release_country rc ON rc.release = r.id JOIN musicbrainz.medium m ON m.release = r.id JOIN musicbrainz.medium_format mf ON m.format = mf.id JOIN mapping.format_sort fs ON mf.id = fs.format JOIN musicbrainz.artist_credit ac ON rg.artist_credit = ac.id JOIN musicbrainz.release_group_primary_type rgpt ON rg.type = rgpt.id LEFT JOIN musicbrainz.release_group_secondary_type_join rgstj ON rg.id = rgstj.release_group LEFT JOIN musicbrainz.release_group_secondary_type rgst ON rgstj.secondary_type = rgst.id WHERE rg.artist_credit != 1 %s ORDER BY rg.type, rgst.id desc, fs.sort, to_date(date_year::TEXT || '-' || COALESCE(date_month,12)::TEXT || '-' || COALESCE(date_day,28)::TEXT, 'YYYY-MM-DD'), country, rg.artist_credit, rg.name""" if config.USE_MINIMAL_DATASET: log("mbid mapping temp tables: Using a minimal dataset for artist credit pairs" ) curs.execute(query % ('AND rg.artist_credit = %d' % TEST_ARTIST_ID)) else: log("mbid mapping temp tables: Using a full dataset for artist credit pairs" ) curs.execute(query % "") # Fetch releases and toss out duplicates -- using DISTINCT in the query above is not possible as it will # destroy the sort order we so carefully crafted. with conn.cursor() as curs_insert: rows = [] count = 0 release_index = {} for row in curs: if row[0] in release_index: continue release_index[row[0]] = 1 count += 1 rows.append((count, row[0])) if len(rows) == BATCH_SIZE: insert_rows(curs_insert, "mapping.tmp_mbid_mapping_releases", rows) rows = [] if count % 1000000 == 0: log("mbid mapping temp tables: inserted %s rows." % count) if rows: insert_rows(curs_insert, "mapping.tmp_mbid_mapping_releases", rows) log("mbid mapping temp tables: create indexes") curs.execute("""CREATE INDEX tmp_mbid_mapping_releases_idx_release ON mapping.tmp_mbid_mapping_releases(release)""" ) curs.execute("""CREATE INDEX tmp_mbid_mapping_releases_idx_id ON mapping.tmp_mbid_mapping_releases(id)""") log("mbid mapping temp tables: done")
def create_mbid_mapping(): """ This function is the heart of the mbid mapping. It calculates the intermediate table and then fetches all the recordings from these tables so that duplicate recording-artist pairs all resolve to the "canonical" release-artist pairs that make them suitable for inclusion in the msid-mapping. """ log("mbid mapping: start") with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn: with mb_conn.cursor( cursor_factory=psycopg2.extras.DictCursor) as mb_curs: # Create the dest table (perhaps dropping the old one first) log("mbid mapping: create schema") create_schema(mb_conn) log("mbid mapping: drop old tables, create new tables") create_tables(mb_conn) create_temp_release_table(mb_conn) with mb_conn.cursor() as mb_curs2: rows = [] last_ac_id = None artist_recordings = {} count = 0 batch_count = 0 serial = 1 log("mbid mapping: fetch recordings") mb_curs.execute("""SELECT r.name AS recording_name, r.gid AS recording_mbid, ac.name AS artist_credit_name, ac.id AS artist_credit_id, rl.name AS release_name, rl.gid AS release_mbid, rpr.id AS score FROM recording r JOIN artist_credit ac ON r.artist_credit = ac.id JOIN artist_credit_name acn ON ac.id = acn.artist_credit JOIN track t ON t.recording = r.id JOIN medium m ON m.id = t.medium JOIN release rl ON rl.id = m.release JOIN mapping.tmp_mbid_mapping_releases rpr ON rl.id = rpr.release LEFT JOIN release_country rc ON rc.release = rl.id GROUP BY rpr.id, ac.id, rl.gid, artist_credit_name, r.gid, r.name, release_name ORDER BY ac.id, rpr.id""") while True: row = mb_curs.fetchone() if not row: break if not last_ac_id: last_ac_id = row['artist_credit_id'] if row['artist_credit_id'] != last_ac_id: # insert the rows that made it rows.extend(artist_recordings.values()) artist_recordings = {} if len(rows) > BATCH_SIZE: insert_rows(mb_curs2, "mapping.tmp_mbid_mapping", rows) count += len(rows) mb_conn.commit() rows = [] batch_count += 1 if batch_count % 200 == 0: log("mbid mapping: inserted %d rows." % count) try: recording_name = row['recording_name'] artist_credit_name = row['artist_credit_name'] release_name = row['release_name'] combined_lookup = unidecode( re.sub(r'[^\w]+', '', artist_credit_name + recording_name).lower()) if recording_name not in artist_recordings: artist_recordings[recording_name] = ( serial, recording_name, row['recording_mbid'], artist_credit_name, row['artist_credit_id'], release_name, row['release_mbid'], combined_lookup, row['score']) serial += 1 except TypeError: log(row) raise last_ac_id = row['artist_credit_id'] rows.extend(artist_recordings.values()) if rows: insert_rows(mb_curs2, "mapping.tmp_mbid_mapping", rows) mb_conn.commit() count += len(rows) log("mbid mapping: inserted %d rows total." % count) log("mbid mapping: create indexes") create_indexes(mb_conn) log("mbid mapping: swap tables and indexes into production.") swap_table_and_indexes(mb_conn) log("mbid mapping: done")
def cron_log():
    if os.path.exists(CRON_LOG_FILE):
        log("Current cron job log file:")
        subprocess.run(["cat", CRON_LOG_FILE])
    else:
        log("No cron log file found.")

def create_pairs(): """ This function is the heart of the recording artist pair mapping. It calculates the intermediate table and then fetches all the recordings from these tables so that duplicate recording-artist pairs all resolve to the "canonical" release-artist pairs that make them suitable for inclusion in the msid-mapping. """ stats = {} stats["started"] = datetime.datetime.utcnow().isoformat() stats["git commit hash"] = subprocess.getoutput("git rev-parse HEAD") with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn: with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs: # Create the dest table (perhaps dropping the old one first) log("Create pairs: drop old tables, create new tables") create_schema(mb_conn) create_tables(mb_conn) stats = create_temp_release_table(mb_conn, stats) mb_curs.execute("SELECT COUNT(*) from musicbrainz.recording") stats["mb_recording_count"] = mb_curs.fetchone()[0] with mb_conn.cursor() as mb_curs2: rows = [] last_ac_id = None artist_recordings = {} count = 0 log("Create pairs: fetch recordings") mb_curs.execute("""SELECT lower(musicbrainz.musicbrainz_unaccent(r.name)) AS recording_name, r.id AS recording_id, lower(musicbrainz.musicbrainz_unaccent(ac.name)) AS artist_credit_name, ac.id AS artist_credit_id, lower(musicbrainz.musicbrainz_unaccent(rl.name)) AS release_name, rl.id as release_id, rpr.id FROM recording r JOIN artist_credit ac ON r.artist_credit = ac.id JOIN artist_credit_name acn ON ac.id = acn.artist_credit JOIN track t ON t.recording = r.id JOIN medium m ON m.id = t.medium JOIN release rl ON rl.id = m.release JOIN mapping.tmp_recording_pair_releases rpr ON rl.id = rpr.release GROUP BY rpr.id, ac.id, rl.id, artist_credit_name, r.id, r.name, release_name ORDER BY ac.id, rpr.id""") log("Create pairs: Insert rows into DB.") while True: row = mb_curs.fetchone() if not row: break if not last_ac_id: last_ac_id = row['artist_credit_id'] if row['artist_credit_id'] != last_ac_id: # insert the rows that made it rows.extend(artist_recordings.values()) artist_recordings = {} if len(rows) > BATCH_SIZE: insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows) count += len(rows) mb_conn.commit() log("Create pairs: inserted %d rows." % count) rows = [] recording_name = row['recording_name'] artist_credit_name = row['artist_credit_name'] release_name = row['release_name'] if config.REMOVE_NON_WORD_CHARS: recording_name = re.sub(r'\W+', '', recording_name) if recording_name not in artist_recordings: if config.REMOVE_NON_WORD_CHARS: artist_credit_name = re.sub(r'\W+', '', artist_credit_name) release_name = re.sub(r'\W+', '', release_name) artist_recordings[recording_name] = (recording_name, row['recording_id'], artist_credit_name, row['artist_credit_id'], release_name, row['release_id']) last_ac_id = row['artist_credit_id'] rows.extend(artist_recordings.values()) if rows: insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows) mb_conn.commit() count += len(rows) log("Create pairs: inserted %d rows total." % count) stats["recording_artist_pair_count"] = count log("Create pairs: create indexes") create_indexes(mb_conn) log("Create pairs: swap tables and indexes into production.") swap_table_and_indexes(mb_conn) stats["completed"] = datetime.datetime.utcnow().isoformat() with psycopg2.connect(config.DB_CONNECT_MB) as conn: with conn.cursor() as curs: curs.execute("""INSERT INTO mapping.mapping_stats (stats) VALUES (%s)""", ((ujson.dumps(stats),))) conn.commit() log("done") print()