def create_tables(mb_conn):
    """
        Create tables needed to create the year mapping. First
        is the temp table that the results will be stored in (in order
        to not conflict with the production version of this table).
        Second its format sort table to enables us to sort releases
        according to preferred format, release date and type.
    """

    # drop/create finished table
    try:
        with mb_conn.cursor() as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.tmp_year_mapping")
            curs.execute("""CREATE TABLE mapping.tmp_year_mapping (
                                         recording_name            TEXT NOT NULL,
                                         artist_credit_name        TEXT NOT NULL,
                                         year                      INTEGER)""")
            curs.execute("DROP TABLE IF EXISTS mapping.tmp_year_mapping_release")
            curs.execute("""CREATE TABLE mapping.tmp_year_mapping_release (
                                            id      SERIAL,
                                            release INTEGER)""")
            create_formats_table(mb_conn)
            mb_conn.commit()
    except (OperationalError, UndefinedTable) as err:
        log("year mapping: failed to create recording pair year tables", err)
        mb_conn.rollback()
        raise
def create_tables(mb_conn):
    """
        Create tables needed to create the recording artist pairs. First
        is the temp table that the results will be stored in (in order
        to not conflict with the production version of this table).
        Second its format sort table to enables us to sort releases
        according to preferred format, release date and type. Finally
        a stats table is created if it doesn't exist.
    """

    # drop/create finished table
    try:
        with mb_conn.cursor() as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.tmp_recording_artist_credit_pairs")
            curs.execute("""CREATE TABLE mapping.tmp_recording_artist_credit_pairs (
                                         recording_name            TEXT NOT NULL,
                                         recording_id              INTEGER NOT NULL,
                                         artist_credit_name        TEXT NOT NULL,
                                         artist_credit_id          INTEGER NOT NULL,
                                         release_name              TEXT NOT NULL,
                                         release_id                INTEGER NOT NULL)""")
            curs.execute("DROP TABLE IF EXISTS mapping.tmp_recording_pair_releases")
            curs.execute("""CREATE TABLE mapping.tmp_recording_pair_releases (
                                            id      SERIAL,
                                            release INTEGER)""")
            create_formats_table(mb_conn)
            create_stats_table(curs)
            mb_conn.commit()
    except (psycopg2.errors.OperationalError, psycopg2.errors.UndefinedTable) as err:
        log("failed to create recording pair tables", err)
        mb_conn.rollback()
        raise
def create_indexes(conn):
    """
        Create indexes for the mapping
    """

    try:
        with conn.cursor() as curs:
            curs.execute(
                """CREATE INDEX tmp_mbid_mapping_idx_artist_credit_recording_name
                                      ON mapping.tmp_mbid_mapping(artist_credit_name, recording_name)"""
            )

            # Remove any duplicate rows so we can create a unique index and not get dups in the results
            curs.execute("""DELETE FROM mapping.tmp_mbid_mapping
                                  WHERE id IN (
                                                SELECT id 
                                                  FROM (
                                                          SELECT id, combined_lookup, score,
                                                                 row_number() OVER (PARTITION BY combined_lookup ORDER BY score)
                                                            FROM mapping.tmp_mbid_mapping
                                                        GROUP BY combined_lookup, score, id
                                                       ) AS q
                                                 WHERE row_number > 1)""")
            curs.execute(
                """CREATE UNIQUE INDEX tmp_mbid_mapping_idx_combined_lookup
                                      ON mapping.tmp_mbid_mapping(combined_lookup)"""
            )
        conn.commit()
    except OperationalError as err:
        log("mbid mapping: failed to mbid mapping", err)
        conn.rollback()
        raise
def swap_table_and_indexes(conn):
    """
        Swap temp tables and indexes for production tables and indexes.
    """

    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.mbid_mapping_releases")
            curs.execute("DROP TABLE IF EXISTS mapping.mbid_mapping")
            curs.execute("""ALTER TABLE mapping.tmp_mbid_mapping
                            RENAME TO mbid_mapping""")
            curs.execute("""ALTER TABLE mapping.tmp_mbid_mapping_releases
                            RENAME TO mbid_mapping_releases""")

            curs.execute(
                """ALTER INDEX mapping.tmp_mbid_mapping_idx_artist_credit_recording_name
                            RENAME TO mbid_mapping_idx_artist_credit_recording_name"""
            )
            curs.execute(
                """ALTER INDEX mapping.tmp_mbid_mapping_idx_combined_lookup
                            RENAME TO mbid_mapping_idx_combined_lookup""")
            curs.execute(
                """ALTER INDEX mapping.tmp_mbid_mapping_releases_idx_release
                            RENAME TO mbid_mapping_releases_idx_release""")
            curs.execute(
                """ALTER INDEX mapping.tmp_mbid_mapping_releases_idx_id
                            RENAME TO mbid_mapping_releases_idx_id""")
        conn.commit()
    except OperationalError as err:
        log("mbid mapping: failed to swap in new mbid mapping tables",
            str(err))
        conn.rollback()
        raise
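# The swap functions in this module all follow the same pattern: drop the
# production tables and rename the temp tables inside a single transaction,
# so readers never observe a missing table. A condensed sketch of that
# pattern with hypothetical arguments (the real functions also rename the
# matching indexes):
def swap_table(conn, schema, tmp_name, prod_name):
    """ Sketch only: identifiers are interpolated directly, so all three
        names must be trusted identifiers, never user input. """
    with conn.cursor() as curs:
        curs.execute("DROP TABLE IF EXISTS %s.%s" % (schema, prod_name))
        curs.execute("ALTER TABLE %s.%s RENAME TO %s" % (schema, tmp_name, prod_name))
    conn.commit()

# e.g. swap_table(conn, "mapping", "tmp_mbid_mapping", "mbid_mapping")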
def create_tables(mb_conn):
    """
        Create tables needed to create the recording artist pairs. First
        is the temp table that the results will be stored in (in order
        to not conflict with the production version of this table).
        Second its format sort table to enables us to sort releases
        according to preferred format, release date and type.
    """

    # drop/create finished table
    try:
        with mb_conn.cursor() as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.tmp_mbid_mapping")
            curs.execute("""CREATE TABLE mapping.tmp_mbid_mapping (
                                         id                        SERIAL,
                                         artist_credit_id          INT NOT NULL,
                                         artist_mbids              UUID[] NOT NULL,
                                         artist_credit_name        TEXT NOT NULL,
                                         release_mbid              UUID NOT NULL,
                                         release_name              TEXT NOT NULL,
                                         recording_mbid            UUID NOT NULL,
                                         recording_name            TEXT NOT NULL,
                                         combined_lookup           TEXT NOT NULL,
                                         score                     INTEGER NOT NULL)""")
            curs.execute(
                "DROP TABLE IF EXISTS mapping.tmp_mbid_mapping_releases")
            curs.execute("""CREATE TABLE mapping.tmp_mbid_mapping_releases (
                                            id      SERIAL,
                                            release INTEGER)""")
            create_formats_table(mb_conn)
            mb_conn.commit()
    except (psycopg2.errors.OperationalError, psycopg2.errors.UndefinedTable) as err:
        log("mbid mapping: failed to mbid mapping tables", err)
        mb_conn.rollback()
        raise
def sync_release_color_table():
    """ Top level function to sync the two CAA and LB cover art tables
        by fetching all rows sorted by caa_id and adding or removing
        cover art as needed. """

    log("cover art sync starting...")
    mb_query = """SELECT caa.id AS caa_id
                       , release AS release_id
                       , release.gid AS release_mbid
                       , mime_type
                    FROM cover_art_archive.cover_art caa
                    JOIN cover_art_archive.cover_art_type cat
                      ON cat.id = caa.id
                    JOIN musicbrainz.release
                      ON caa.release = release.id
                   WHERE type_id = 1
                     AND caa.id > %s
                ORDER BY caa.id
                   LIMIT %s"""

    lb_query = """ SELECT caa_id
                     FROM release_color
                    WHERE caa_id > %s
                 ORDER BY caa_id
                    LIMIT %s"""

    compare_coverart(mb_query, lb_query, 0, 0, "caa_id", "caa_id")
def create_table(conn):
    """
        Given a valid postgres connection, create temporary tables to generate
        the mapping into. These temporary tables will later be swapped out
        with the production tables in a single transaction.
    """

    try:
        with conn.cursor() as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.tmp_msid_mbid_mapping")
            curs.execute("""CREATE TABLE mapping.tmp_msid_mbid_mapping (
                                         count INTEGER,
                                         msb_artist_name     TEXT,
                                         msb_artist_msid     UUID,
                                         msb_recording_name  TEXT,
                                         msb_recording_msid  UUID,
                                         msb_release_name    TEXT,
                                         msb_release_msid    UUID,
                                         mb_artist_name      TEXT,
                                         mb_artist_credit_id INTEGER,
                                         mb_recording_name   TEXT,
                                         mb_recording_id     INTEGER,
                                         mb_release_name     TEXT,
                                         mb_release_id       INTEGER,
                                         source              TEXT)""")
            create_stats_table(curs)
            conn.commit()
    except DuplicateTable as err:
        log("Cannot drop/create tables: ", str(err))
        conn.rollback()
        raise
def swap_table_and_indexes(conn):
    """
        Swap temp tables and indexes for production tables and indexes.
    """

    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:
            curs.execute("DROP TABLE mapping.recording_pair_releases")
            curs.execute("DROP TABLE mapping.recording_artist_credit_pairs")
            curs.execute("""ALTER TABLE mapping.tmp_recording_artist_credit_pairs
                              RENAME TO recording_artist_credit_pairs""")
            curs.execute("""ALTER TABLE mapping.tmp_recording_pair_releases
                              RENAME TO recording_pair_releases""")

            curs.execute("""ALTER INDEX mapping.tmp_recording_artist_credit_pairs_idx_artist_credit_name
                              RENAME TO recording_artist_credit_pairs_idx_artist_credit_name""")
            curs.execute("""ALTER INDEX mapping.tmp_recording_pair_releases_idx_release
                              RENAME TO recording_pair_releases_idx_release""")
            curs.execute("""ALTER INDEX mapping.tmp_recording_pair_releases_idx_id
                              RENAME TO recording_pair_releases_idx_id""")
        conn.commit()
    except OperationalError as err:
        log("failed to swap in new recording pair tables", str(err))
        conn.rollback()
        raise
def cron_log():
    """
        Print the internal cron log file for debugging purposes.
    """
    if os.path.exists(CRON_LOG_FILE):
        log("Current cron job log file:")
        subprocess.run(["cat", CRON_LOG_FILE])
    else:
        log("Log file is empty")
def build(client, collection_name):

    schema = {
        'name': collection_name,
        'fields': [
          {
            'name':  'combined',
            'type':  'string'
          },
          {
            'name':  'score',
            'type':  'int32'
          },
        ],
        'default_sorting_field': 'score'
    }


    client.collections.create(schema)

    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:

            curs.execute("SELECT max(score) FROM mapping.mbid_mapping")
            max_score = curs.fetchone()[0]

            query = ("""SELECT recording_name,
                               recording_mbid,
                               release_name,
                               release_mbid,
                               artist_credit_id,
                               artist_credit_name,
                               artist_mbids,
                               score
                          FROM mapping.mbid_mapping""")

            curs.execute(query)
            documents = []
            for i, row in enumerate(curs):
                document = dict(row)
                document['artist_mbids'] = row["artist_mbids"][1:-1]
                document['score'] = max_score - document['score']
                document['combined'] = prepare_string(document['recording_name'] + " " + document['artist_credit_name'])
                documents.append(document)

                if len(documents) == BATCH_SIZE:
                    client.collections[collection_name].documents.import_(documents)
                    documents = []

                if i and i % 1000000 == 0:
                    log("typesense index: Indexed %d rows" % i)

            if documents:
                client.collections[collection_name].documents.import_(documents)

    log("typesense index: indexing complete. waiting for background tasks to finish.")
    time.sleep(5)
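# Once the collection above is built, a lookup searches the "combined" field;
# because score was stored as max_score - score, typesense's default
# descending sort on the sorting field returns the most canonical match
# first. A minimal lookup sketch; the helper and its search parameters are
# assumptions, not part of the original code:
def lookup(client, collection_name, artist_credit_name, recording_name):
    # Normalize the query string the same way the indexed "combined"
    # field was normalized at build time.
    return client.collections[collection_name].documents.search({
        'q': prepare_string(artist_credit_name + " " + recording_name),
        'query_by': 'combined'
    })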
def create_indexes(conn):
    """
        Create indexes for the year mapping
    """

    try:
        with conn.cursor() as curs:
            curs.execute("""CREATE INDEX tmp_year_mapping_idx_ac_rec_year
                                      ON mapping.tmp_year_mapping(artist_credit_name, recording_name)""")
        conn.commit()
    except OperationalError as err:
        log("year mapping: failed to create recording pair index", err)
        conn.rollback()
        raise
def create_indexes(conn):
    """
        Create indexes for the recording artist pairs
    """

    try:
        with conn.cursor() as curs:
            curs.execute("""CREATE INDEX tmp_recording_artist_credit_pairs_idx_artist_credit_name
                                      ON mapping.tmp_recording_artist_credit_pairs(artist_credit_name)""")
        conn.commit()
    except OperationalError as err:
        log("failed to create recording pair index", err)
        conn.rollback()
        raise
def create_indexes(mb_conn):
    """ Create the user_name index on the tracks of the year table. """

    try:
        with mb_conn.cursor() as curs:
            curs.execute("""CREATE INDEX tracks_of_the_year_ndx_user_name
                                      ON mapping.tracks_of_the_year (user_name)"""
                         )
            mb_conn.commit()
    except (psycopg2.errors.OperationalError,
            psycopg2.errors.UndefinedTable) as err:
        log("mbid mapping: failed to create tracks of the year indexes", err)
        mb_conn.rollback()
        raise
def load_MB_recordings():
    """
        Load all of the recording artists pairs into ram and sort them for matching.
    """

    mb_recordings = []
    with psycopg2.connect(config.DB_CONNECT_MB) as conn:
        with conn.cursor() as curs:
            query = """SELECT DISTINCT lower(public.unaccent(artist_credit_name::TEXT)) as artist_credit_name, artist_credit_id,
                                       lower(public.unaccent(recording_name::TEXT)) AS recording_name, recording_id,
                                       lower(public.unaccent(release_name::TEXT)) AS release_name, release_id
                         FROM mapping.recording_artist_credit_pairs"""
            if config.USE_MINIMAL_DATASET:
                query += " WHERE artist_credit_id = 1160983"
            curs.execute(query)
            while True:
                mb_row = curs.fetchone()
                if not mb_row:
                    break

                artist = mb_row[0]
                artist_credit_id = int(mb_row[1])
                recording = mb_row[2]
                recording_id = int(mb_row[3])
                release = mb_row[4]
                release_id = int(mb_row[5])
                if config.REMOVE_NON_WORD_CHARS:
                    artist = re.sub(r'\W+', '', artist)
                    recording = re.sub(r'\W+', '', recording)
                    release = re.sub(r'\W+', '', release)

                mb_recordings.append({
                    "artist_name": artist,
                    "artist_credit_id": artist_credit_id,
                    "recording_name": recording,
                    "recording_id": recording_id,
                    "release_name": release,
                    "release_id": release_id,
                })

    log("loaded %d MB recordings, now sorting" % len(mb_recordings))
    mb_recording_index = list(range(len(mb_recordings)))
    mb_recording_index = sorted(mb_recording_index,
                                key=lambda rec:
                                (mb_recordings[rec]["artist_name"],
                                 mb_recordings[rec]["recording_name"]))

    return (mb_recordings, mb_recording_index)
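# The sorted index returned above supports binary-search matching against
# MessyBrainz rows. A hypothetical consumer built on that return value (the
# bisect-based helper below is an assumption about how the index is used,
# not part of the original code):
from bisect import bisect_left

def find_recording(mb_recordings, mb_recording_index, artist_name, recording_name):
    # Inputs must be normalized (lowercased, unaccented) exactly like the
    # rows loaded by load_MB_recordings().
    keys = [(mb_recordings[i]["artist_name"], mb_recordings[i]["recording_name"])
            for i in mb_recording_index]
    pos = bisect_left(keys, (artist_name, recording_name))
    if pos < len(keys) and keys[pos] == (artist_name, recording_name):
        return mb_recordings[mb_recording_index[pos]]
    return None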
def fetch_tracks_listened_to(lb_conn, mb_conn, start_ts, end_ts):
    """ Actually fetch the top discoveries for the given year and set of users """

    with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs:
        with mb_conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor) as mb_curs:
            log("create tracks listened table")
            create_table(mb_conn)

            log("fetch tracks listened to")
            # Fetch the basic data for all tracks that users listened to in a given year.
            query = """SELECT user_name
                            , m.recording_mbid
                            , md.recording_name
                            , md.artist_credit_name
                            , md.artist_mbids
                            , count(*) AS listen_count
                         FROM listen l
                         JOIN mbid_mapping m
                           ON data->'track_metadata'->'additional_info'->>'recording_msid' = m.recording_msid::TEXT
                         JOIN mbid_mapping_metadata md
                           ON m.recording_mbid = md.recording_mbid
                        WHERE listened_at >= %s
                          AND listened_at < %s
                          AND m.recording_mbid is not null
                     GROUP BY m.recording_mbid, md.recording_name,
                              md.artist_credit_name, md.artist_mbids, user_name""" % (
                start_ts, end_ts)

            to_insert = []
            lb_curs.execute(query)
            while True:
                row = lb_curs.fetchone()
                if not row:
                    break

                to_insert.append(row)
                if len(to_insert) >= BATCH_SIZE:
                    insert_rows(mb_curs, "mapping.tracks_of_the_year",
                                to_insert)
                    to_insert = []
                    mb_conn.commit()

            insert_rows(mb_curs, "mapping.tracks_of_the_year", to_insert)
            mb_conn.commit()
def fetch_top_discoveries_for_users(lb_conn, mb_conn, year):
    """ Actually fetch the top discoveries for the given year"""

    with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs:
        with mb_conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor) as mb_curs:
            log("crate top_listens table")
            create_table(mb_conn)

            log("fetch active users")
            user_list = fetch_user_list(lb_conn, year)
            log("Process %d users." % len(user_list))

            # This query will select all listens/data for a given user list and create an array of the years when
            # a track was listened to. This data is not directly useful for anything, but from it
            # a few types of playlists can be generated.
            query = """SELECT user_name
                            , track_name
                            , data->'track_metadata'->>'artist_name' AS artist_name
                            , array_agg(extract(year from to_timestamp(listened_at))::INT ORDER BY
                                        extract(year from to_timestamp(listened_at))::INT) AS years
                            , m.recording_mbid
                            , mm.artist_mbids
                         FROM listen
              FULL OUTER JOIN mbid_mapping m
                           ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = m.recording_msid
              FULL OUTER JOIN mbid_mapping_metadata mm
                              ON mm.recording_mbid = m.recording_mbid
                        WHERE user_name in %s
                          AND mm.recording_mbid IS NOT NULL
                     GROUP BY user_name, artist_name, mm.artist_mbids, track_name, m.recording_mbid
                       HAVING (array_agg(extract(year from to_timestamp(listened_at))::INT ORDER BY
                                         extract(year from to_timestamp(listened_at))::INT))[1] = %s
                     ORDER BY user_name, array_length(array_agg(extract(year from to_timestamp(listened_at))::INT), 1) DESC"""

            for users in chunks(user_list, USERS_PER_BATCH):
                log(users)
                lb_curs.execute(query, (tuple(users), year))

                top_recordings = []
                while True:
                    row = lb_curs.fetchone()
                    if not row:
                        break

                    if len(row["years"]) > 2:
                        top_recordings.append(
                            (row["recording_mbid"], row["track_name"],
                             row["artist_name"], row["artist_mbids"],
                             len(row["years"]), row["user_name"]))

                print("insert %d rows" % len(top_recordings))
                insert_rows(mb_curs, "mapping.top_discoveries", top_recordings)
                mb_conn.commit()
def incremental_update_release_color_table():
    """ Incrementally update the cover art mapping. This is designed to run hourly
        and save a last_updated timestamp in the cache. If the cache value cannot be
        found, a complete sync is run instead and the cache value is set. """

    cache.init(host=config.REDIS_HOST, port=config.REDIS_PORT,
               namespace=config.REDIS_NAMESPACE)

    try:
        last_updated = cache.get(LAST_UPDATED_CACHE_KEY, decode=True) or None
    except Exception:
        last_updated = None

    if not last_updated:
        log("No timestamp found, performing full sync")
        sync_release_color_table()
        last_updated = get_last_updated_from_caa()
        cache.set(LAST_UPDATED_CACHE_KEY, last_updated,
                  expirein=0, encode=True)
        return

    log("cover art incremental update starting...")
    mb_query = """SELECT caa.id AS caa_id
                       , release AS release_id
                       , release.gid AS release_mbid
                       , mime_type
                       , date_uploaded
                    FROM cover_art_archive.cover_art caa
                    JOIN cover_art_archive.cover_art_type cat
                      ON cat.id = caa.id
                    JOIN musicbrainz.release
                      ON caa.release = release.id
                   WHERE type_id = 1
                     AND caa.date_uploaded > %s
                ORDER BY caa.date_uploaded
                   LIMIT %s"""

    compare_coverart(mb_query, None, last_updated, None,
                     "date_uploaded", "last_updated")

    last_updated = get_last_updated_from_caa()
    cache.set(LAST_UPDATED_CACHE_KEY, last_updated, expirein=0, encode=True)
def swap_table_and_indexes(conn):
    """
        This function swaps the temporary tables that the mapping was written to
        with the production tables, inside a single transaction. This should isolate
        the end users from ever seeing any downtime in mapping availability.
    """

    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:
            curs.execute("DROP TABLE mapping.msid_mbid_mapping")
            curs.execute("""ALTER TABLE mapping.tmp_msid_mbid_mapping
                              RENAME TO msid_mbid_mapping""")
            curs.execute(
                """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_recording_name
                              RENAME TO msid_mbid_mapping_idx_msb_recording_name"""
            )
            curs.execute(
                """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_recording_msid
                              RENAME TO msid_mbid_mapping_idx_msb_recording_msid"""
            )
            curs.execute(
                """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_artist_name
                              RENAME TO msid_mbid_mapping_idx_msb_artist_name"""
            )
            curs.execute(
                """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_artist_msid
                              RENAME TO msid_mbid_mapping_idx_msb_artist_msid"""
            )
            curs.execute(
                """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_release_name
                              RENAME TO msid_mbid_mapping_idx_msb_release_name"""
            )
            curs.execute(
                """ALTER INDEX mapping.tmp_msid_mbid_mapping_idx_msb_release_msid
                              RENAME TO msid_mbid_mapping_idx_msb_release_msid"""
            )
        conn.commit()
    except OperationalError as err:
        log("failed to swap in new mapping table", str(err))
        conn.rollback()
        raise
def create_table(mb_conn):
    """ Create the tracks of the year table in the mapping schema of a docker-musicbrinz
        instance. """

    try:
        with mb_conn.cursor() as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.tracks_of_the_year")
            curs.execute("""CREATE TABLE mapping.tracks_of_the_year
                                       ( user_name          TEXT NOT NULL
                                       , recording_mbid     UUID NOT NULL
                                       , recording_name     TEXT NOT NULL
                                       , artist_credit_name TEXT NOT NULL
                                       , artist_mbids       UUID[] NOT NULL
                                       , listen_count       INTEGER NOT NULL
                                       )""")
            mb_conn.commit()
    except (psycopg2.errors.OperationalError,
            psycopg2.errors.UndefinedTable) as err:
        log("mbid mapping: failed to create tracks of the year table", err)
        mb_conn.rollback()
        raise
def swap_table_and_indexes(conn):
    """
        Swap temp tables and indexes for production tables and indexes.
    """

    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:
            curs.execute("DROP TABLE IF EXISTS mapping.year_mapping")
            curs.execute("DROP TABLE IF EXISTS mapping.year_mapping_release")
            curs.execute("""ALTER TABLE mapping.tmp_year_mapping_release
                              RENAME TO year_mapping_release""")
            curs.execute("""ALTER TABLE mapping.tmp_year_mapping
                              RENAME TO year_mapping""")

            curs.execute("""ALTER INDEX mapping.tmp_year_mapping_idx_ac_rec_year
                              RENAME TO year_mapping_idx_ac_rec_year""")
        conn.commit()
    except OperationalError as err:
        log("year mapping: failed to swap in new recording pair tables", str(err))
        conn.rollback()
        raise
def create_indexes(conn):
    """
        Create the indexes on the mapping.
    """

    try:
        with conn.cursor() as curs:
            curs.execute(
                """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_recording_name
                                      ON mapping.tmp_msid_mbid_mapping(msb_recording_name)"""
            )
            curs.execute(
                """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_recording_msid
                                      ON mapping.tmp_msid_mbid_mapping(msb_recording_msid)"""
            )
            curs.execute(
                """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_artist_name
                                      ON mapping.tmp_msid_mbid_mapping(msb_artist_name)"""
            )
            curs.execute(
                """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_artist_msid
                                      ON mapping.tmp_msid_mbid_mapping(msb_artist_msid)"""
            )
            curs.execute(
                """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_release_name
                                      ON mapping.tmp_msid_mbid_mapping(msb_release_name)"""
            )
            curs.execute(
                """CREATE INDEX tmp_msid_mbid_mapping_idx_msb_release_msid
                                      ON mapping.tmp_msid_mbid_mapping(msb_release_msid)"""
            )
            conn.commit()
    except OperationalError as err:
        conn.rollback()
        log("creating indexes failed.")
        raise
def create_temp_release_table(conn, stats):
    """
        Creates an intermediate table that orders releases by types, format,
        releases date, country and artist_credit. This sorting should in theory
        sort the most desired releases (albums, digital releases, first released)
        over the other types in order to match to the "canonical" releases
        and to also ensure that tracks that came from one release
        will be matched to the same release and will not end up being
        scattered across many releases from the same artist.
    """

    with conn.cursor() as curs:
        log("Create temp release table: select")
        query = """INSERT INTO mapping.tmp_recording_pair_releases (release)
                        SELECT r.id
                          FROM musicbrainz.release_group rg
                          JOIN musicbrainz.release r ON rg.id = r.release_group
                          JOIN musicbrainz.release_country rc ON rc.release = r.id
                          JOIN musicbrainz.medium m ON m.release = r.id
                          JOIN musicbrainz.medium_format mf ON m.format = mf.id
                          JOIN mapping.format_sort fs ON mf.id = fs.format
                          JOIN musicbrainz.artist_credit ac ON rg.artist_credit = ac.id
                          JOIN musicbrainz.release_group_primary_type rgpt ON rg.type = rgpt.id
               FULL OUTER JOIN musicbrainz.release_group_secondary_type_join rgstj ON rg.id = rgstj.release_group
               FULL OUTER JOIN musicbrainz.release_group_secondary_type rgst ON rgstj.secondary_type = rgst.id
                         WHERE rg.artist_credit != 1
                               %s
                         ORDER BY rg.type, rgst.id desc, fs.sort,
                                  to_date(date_year::TEXT || '-' ||
                                          COALESCE(date_month,12)::TEXT || '-' ||
                                          COALESCE(date_day,28)::TEXT, 'YYYY-MM-DD'),
                                  country, rg.artist_credit, rg.name"""

        if config.USE_MINIMAL_DATASET:
            log("Create temp release table: Using a minimal dataset!")
            curs.execute(query % ('AND rg.artist_credit = %d' % TEST_ARTIST_ID))
        else:
            curs.execute(query % "")

        log("Create temp release table: create indexes")
        curs.execute("""CREATE INDEX tmp_recording_pair_releases_idx_release
                                  ON mapping.tmp_recording_pair_releases(release)""")
        curs.execute("""CREATE INDEX tmp_recording_pair_releases_idx_id
                                  ON mapping.tmp_recording_pair_releases(id)""")

        curs.execute("SELECT COUNT(*) from mapping.tmp_recording_pair_releases")
        stats["recording_pair_release_count"] = curs.fetchone()[0]
        curs.execute("SELECT COUNT(*) from musicbrainz.release")
        stats["mb_release_count"] = curs.fetchone()[0]

    return stats
def process_row(row):
    """ Process one CAA query row, by fetching the 250px thumbnail,
        process the color, then import into the DB """

    sleep_duration = 2
    while True:
        headers = {
            'User-Agent': 'ListenBrainz HueSound Color Bot ( [email protected] )'}
        release_mbid, caa_id = row["release_mbid"], row["caa_id"]
        url = f"https://archive.org/download/mbid-{release_mbid}/mbid-{release_mbid}-{caa_id}_thumb250.jpg"
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            filename = "/tmp/release-colors-%s.img" % get_ident()
            with open(filename, 'wb') as f:
                for chunk in r:
                    f.write(chunk)

            try:
                red, green, blue = process_image(filename, row["mime_type"])
                insert_row(row["release_mbid"], red,
                           green, blue, row["caa_id"])
                log("%s %s: (%s, %s, %s)" %
                      (row["caa_id"], row["release_mbid"], red, green, blue))
            except Exception as err:
                log("Could not process %s" % url)
                log(err)

            os.unlink(filename)
            sleep_duration = 2
            break

        if r.status_code == 403:
            break

        if r.status_code == 404:
            break

        if r.status_code == 429:
            log("Exceeded rate limit. sleeping %d seconds." % sleep_duration)
            sleep(sleep_duration)
            sleep_duration *= 2
            if sleep_duration > 100:
                return

            continue

        if r.status_code == 503:
            log("Service not available. sleeping %d seconds." % sleep_duration)
            sleep(sleep_duration)
            sleep_duration *= 2
            if sleep_duration > 100:
                return
            continue

        log("Unhandled %d" % r.status_code)
        break
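# The retry logic in process_row() reduces to exponential backoff on 429/503
# responses, capped near 100 seconds, with the delay reset after a successful
# fetch. The same policy condensed into a hypothetical standalone helper
# (process_row additionally treats 403/404 as permanent failures):
def fetch_with_backoff(url, headers, max_sleep=100):
    sleep_duration = 2
    while True:
        r = requests.get(url, headers=headers)
        if r.status_code in (429, 503):
            # back off and retry until the cap is hit
            sleep(sleep_duration)
            sleep_duration *= 2
            if sleep_duration > max_sleep:
                return None
            continue
        return r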
def compare_coverart(mb_query, lb_query, mb_caa_index, lb_caa_index, mb_compare_key, lb_compare_key):
    """ The core cover art comparison function. Given two sets of queries, index values, and 
        comparison keys this function can perform a complete sync as well as an incremental update.

        The queries must fetch chunks of data from the MB and LB tables ordered by
        the corresponding compare key. The starting indexes (the current comparison index
        into the data) must be provided and match the type of the comparison keys. """

    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:
            with psycopg2.connect(config.SQLALCHEMY_DATABASE_URI) as lb_conn:
                with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs:

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    threads = []
                    mb_row = None
                    lb_row = None

                    mb_rows = []
                    lb_rows = []

                    mb_done = False
                    lb_done = lb_query is None

                    extra = 0
                    missing = 0
                    processed = 0

                    while True:
                        if len(mb_rows) == 0 and not mb_done:
                            mb_curs.execute(
                                mb_query, (mb_caa_index, SYNC_BATCH_SIZE))
                            mb_rows = mb_curs.fetchall()
                            if len(mb_rows) == 0:
                                mb_done = True

                        if len(lb_rows) == 0 and not lb_done:
                            lb_curs.execute(
                                lb_query, (lb_caa_index, SYNC_BATCH_SIZE))
                            lb_rows = lb_curs.fetchall()
                            if len(lb_rows) == 0:
                                lb_done = True

                        if not mb_row and len(mb_rows) > 0:
                            mb_row = mb_rows.pop(0)

                        if not lb_row and len(lb_rows) > 0:
                            lb_row = lb_rows.pop(0)

                        if not lb_row and not mb_row:
                            break

                        processed += 1
                        if processed % 100000 == 0:
                            log("processed %d of %d: missing %d extra %d" %
                                  (processed, mb_count, missing, extra))

                        # If the item is in MB, but not in LB, add to LB
                        if lb_row is None or mb_row[mb_compare_key] < lb_row[lb_compare_key]:
                            process_cover_art(threads, mb_row)
                            missing += 1
                            mb_caa_index = mb_row[mb_compare_key]
                            mb_row = None
                            continue

                        # If the item is in LB, but not in MB, remove from LB
                        if mb_row is None or mb_row[mb_compare_key] > lb_row[lb_compare_key]:
                            extra += 1
                            delete_from_lb(lb_row[lb_compare_key])
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            continue

                        # If the caa_id is present in both, skip both
                        if mb_row[mb_compare_key] == lb_row[lb_compare_key]:
                            mb_caa_index = mb_row[mb_compare_key]
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            mb_row = None
                            continue

                        assert False

                    join_threads(threads)
                    log( "Finished! added/skipped %d removed %d from release_color" % (missing, extra))

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    metrics.init("listenbrainz")
                    metrics.set("listenbrainz-caa-mapper",
                                caa_front_count=mb_count, lb_caa_count=lb_count)
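# Stripped of batching and threading, the loop above is a standard
# merge-diff over two streams sorted on the same key. A self-contained
# sketch over plain lists showing the three cases:
def diff_sorted(mb_keys, lb_keys):
    missing, extra = [], []
    i = j = 0
    while i < len(mb_keys) or j < len(lb_keys):
        if j == len(lb_keys) or (i < len(mb_keys) and mb_keys[i] < lb_keys[j]):
            missing.append(mb_keys[i])  # in MB but not LB: add to LB
            i += 1
        elif i == len(mb_keys) or mb_keys[i] > lb_keys[j]:
            extra.append(lb_keys[j])    # in LB but not MB: remove from LB
            j += 1
        else:                           # same key on both sides: skip both
            i += 1
            j += 1
    return missing, extra

# diff_sorted([1, 2, 4], [2, 3]) == ([1, 4], [3])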
def build(client, collection_name):

    schema = {
        'name': collection_name,
        'fields': [
            {
                'name': 'combined',
                'type': 'string'
            },
            {
                'name': 'score',
                'type': 'int32'
            },
        ],
        'default_sorting_field': 'score'
    }

    client.collections.create(schema)

    with psycopg2.connect(config.DB_CONNECT_MB) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as curs:

            curs.execute("SELECT max(score) FROM mapping.mbid_mapping")
            max_score = curs.fetchone()[0]

            query = ("""SELECT recording_name AS recording_name,
                               r.gid AS recording_mbid,
                               release_name AS release_name,
                               rl.gid AS release_mbid,
                               artist_credit_name AS artist_credit_name,
                               artist_credit_id,
                               score
                          FROM mapping.mbid_mapping
                          JOIN recording r
                            ON r.id = recording_id
                          JOIN release rl
                            ON rl.id = release_id""")

            if config.USE_MINIMAL_DATASET:
                query += " WHERE artist_credit_id = 1160983"

            curs.execute(query)
            documents = []
            for i, row in enumerate(curs):
                document = dict(row)
                document['score'] = max_score - document['score']
                document['combined'] = prepare_string(
                    document['recording_name'] + " " +
                    document['artist_credit_name'])
                documents.append(document)

                if len(documents) == BATCH_SIZE:
                    client.collections[collection_name].documents.import_(
                        documents)
                    documents = []

                if i and i % 1000000 == 0:
                    log("typesense index: Indexed %d rows" % i)

            if documents:
                client.collections[collection_name].documents.import_(
                    documents)

    log("typesense index: indexing complete. waiting for background tasks to finish."
        )
    time.sleep(5)
def build_index():

    client = typesense.Client({
        'nodes': [{
            'host': config.TYPESENSE_HOST,
            'port': config.TYPESENSE_PORT,
            'protocol': 'http',
        }],
        'api_key': config.TYPESENSE_API_KEY,
        'connection_timeout_seconds': 1000000
    })

    collection_name = COLLECTION_NAME_PREFIX + \
        datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    try:
        log("typesense index: build index '%s'" % collection_name)
        build(client, collection_name)
    except typesense.exceptions.TypesenseClientError as err:
        log("typesense index: Cannot build index: ", str(err))
        return -1

    try:
        latest = COLLECTION_NAME_PREFIX + "latest"
        log("typesense index: alias index '%s' to %s" %
            (collection_name, latest))
        aliased_collection = {"collection_name": collection_name}
        client.aliases.upsert(latest, aliased_collection)
    except typesense.exceptions.TypesenseClientError as err:
        log("typesense index: Cannot build index: ", str(err))
        return -2

    try:
        for collection in client.collections.retrieve():
            if collection["name"] == collection_name:
                continue

            if collection["name"].startswith(COLLECTION_NAME_PREFIX):
                log("typesense index: delete collection '%s'" %
                    collection["name"])
                client.collections[collection["name"]].delete()
            else:
                log("typesense index: ignore collection '%s'" %
                    collection["name"])

    except typesense.exceptions.ObjectNotFound as err:
        log("typesense index: Failed to delete collection: ", str(err))

    return 0
def create_temp_release_table(conn):
    """
        Creates an intermediate table that orders releases by types, format,
        releases date, country and artist_credit. This sorting should in theory
        sort the most desired releases (albums, digital releases, first released)
        over the other types in order to match to the "canonical" releases
        and to also ensure that tracks that came from one release
        will be matched to the same release and will not end up being
        scattered across many releases from the same artist.
    """

    with conn.cursor() as curs:
        log("mbid mapping temp tables: Create temp release table: select")
        query = """             SELECT r.id AS release
                                  FROM musicbrainz.release_group rg
                                  JOIN musicbrainz.release r ON rg.id = r.release_group
                             LEFT JOIN musicbrainz.release_country rc ON rc.release = r.id
                                  JOIN musicbrainz.medium m ON m.release = r.id
                                  JOIN musicbrainz.medium_format mf ON m.format = mf.id
                                  JOIN mapping.format_sort fs ON mf.id = fs.format
                                  JOIN musicbrainz.artist_credit ac ON rg.artist_credit = ac.id
                                  JOIN musicbrainz.release_group_primary_type rgpt ON rg.type = rgpt.id
                             LEFT JOIN musicbrainz.release_group_secondary_type_join rgstj ON rg.id = rgstj.release_group
                             LEFT JOIN musicbrainz.release_group_secondary_type rgst ON rgstj.secondary_type = rgst.id
                                 WHERE rg.artist_credit != 1
                                       %s
                                 ORDER BY rg.type, rgst.id desc, fs.sort,
                                          to_date(date_year::TEXT || '-' ||
                                                  COALESCE(date_month,12)::TEXT || '-' ||
                                                  COALESCE(date_day,28)::TEXT, 'YYYY-MM-DD'),
                                          country, rg.artist_credit, rg.name"""

        if config.USE_MINIMAL_DATASET:
            log("mbid mapping temp tables: Using a minimal dataset for artist credit pairs"
                )
            curs.execute(query %
                         ('AND rg.artist_credit = %d' % TEST_ARTIST_ID))
        else:
            log("mbid mapping temp tables: Using a full dataset for artist credit pairs"
                )
            curs.execute(query % "")

        # Fetch releases and toss out duplicates -- using DISTINCT in the query above is not possible as it will
        # destroy the sort order we so carefully crafted.
        with conn.cursor() as curs_insert:
            rows = []
            count = 0
            release_index = {}
            for row in curs:
                if row[0] in release_index:
                    continue

                release_index[row[0]] = 1

                count += 1
                rows.append((count, row[0]))
                if len(rows) == BATCH_SIZE:
                    insert_rows(curs_insert,
                                "mapping.tmp_mbid_mapping_releases", rows)
                    rows = []

                if count % 1000000 == 0:
                    log("mbid mapping temp tables: inserted %s rows." % count)

            if rows:
                insert_rows(curs_insert, "mapping.tmp_mbid_mapping_releases",
                            rows)

        log("mbid mapping temp tables: create indexes")
        curs.execute("""CREATE INDEX tmp_mbid_mapping_releases_idx_release
                                  ON mapping.tmp_mbid_mapping_releases(release)"""
                     )
        curs.execute("""CREATE INDEX tmp_mbid_mapping_releases_idx_id
                                  ON mapping.tmp_mbid_mapping_releases(id)""")
        log("mbid mapping temp tables: done")
def create_mbid_mapping():
    """
        This function is the heart of the mbid mapping. It
        calculates the intermediate table and then fetches all the recordings
        from these tables so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pairs that make
        them suitable for inclusion in the msid-mapping.
    """

    log("mbid mapping: start")
    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("mbid mapping: create schema")
            create_schema(mb_conn)
            log("mbid mapping: drop old tables, create new tables")
            create_tables(mb_conn)

            create_temp_release_table(mb_conn)
            with mb_conn.cursor() as mb_curs2:
                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                batch_count = 0
                serial = 1
                log("mbid mapping: fetch recordings")
                mb_curs.execute("""SELECT r.name AS recording_name,
                                          r.gid AS recording_mbid,
                                          ac.name AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          rl.name AS release_name,
                                          rl.gid AS release_mbid,
                                          rpr.id AS score
                                     FROM recording r
                                     JOIN artist_credit ac
                                       ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn
                                       ON ac.id = acn.artist_credit
                                     JOIN track t
                                       ON t.recording = r.id
                                     JOIN medium m
                                       ON m.id = t.medium
                                     JOIN release rl
                                       ON rl.id = m.release
                                     JOIN mapping.tmp_mbid_mapping_releases rpr
                                       ON rl.id = rpr.release
                                LEFT JOIN release_country rc
                                       ON rc.release = rl.id
                                    GROUP BY rpr.id, ac.id, rl.gid, artist_credit_name, r.gid, r.name, release_name
                                    ORDER BY ac.id, rpr.id""")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_mbid_mapping",
                                        rows)
                            count += len(rows)
                            mb_conn.commit()
                            rows = []
                            batch_count += 1

                            if batch_count % 200 == 0:
                                log("mbid mapping: inserted %d rows." % count)

                    try:
                        recording_name = row['recording_name']
                        artist_credit_name = row['artist_credit_name']
                        release_name = row['release_name']
                        combined_lookup = unidecode(
                            re.sub(r'[^\w]+', '', artist_credit_name +
                                   recording_name).lower())
                        if recording_name not in artist_recordings:
                            artist_recordings[recording_name] = (
                                serial, recording_name, row['recording_mbid'],
                                artist_credit_name, row['artist_credit_id'],
                                release_name, row['release_mbid'],
                                combined_lookup, row['score'])
                            serial += 1
                    except TypeError:
                        log(row)
                        raise

                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_mbid_mapping", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("mbid mapping: inserted %d rows total." % count)
            log("mbid mapping: create indexes")
            create_indexes(mb_conn)

            log("mbid mapping: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    log("mbid mapping: done")
def cron_log():
    if os.path.exists(CRON_LOG_FILE):
        log("Current cron job log file:")
        subprocess.run(["cat", CRON_LOG_FILE])
    else:
        log("Log file is empty")
def create_pairs():
    """
        This function is the heart of the recording artist pair mapping. It
        calculates the intermediate table and then fetches all the recordings
        from these tables so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pairs that make
        them suitable for inclusion in the msid-mapping.
    """

    stats = {}
    stats["started"] = datetime.datetime.utcnow().isoformat()
    stats["git commit hash"] = subprocess.getoutput("git rev-parse HEAD")

    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("Create pairs: drop old tables, create new tables")
            create_schema(mb_conn)
            create_tables(mb_conn)

            stats = create_temp_release_table(mb_conn, stats)

            mb_curs.execute("SELECT COUNT(*) from musicbrainz.recording")
            stats["mb_recording_count"] = mb_curs.fetchone()[0]

            with mb_conn.cursor() as mb_curs2:

                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                log("Create pairs: fetch recordings")
                mb_curs.execute("""SELECT lower(musicbrainz.musicbrainz_unaccent(r.name)) AS recording_name,
                                          r.id AS recording_id,
                                          lower(musicbrainz.musicbrainz_unaccent(ac.name)) AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          lower(musicbrainz.musicbrainz_unaccent(rl.name)) AS release_name,
                                          rl.id as release_id,
                                          rpr.id
                                     FROM recording r
                                     JOIN artist_credit ac ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn ON ac.id = acn.artist_credit
                                     JOIN track t ON t.recording = r.id
                                     JOIN medium m ON m.id = t.medium
                                     JOIN release rl ON rl.id = m.release
                                     JOIN mapping.tmp_recording_pair_releases rpr ON rl.id = rpr.release
                                    GROUP BY rpr.id, ac.id, rl.id, artist_credit_name, r.id, r.name, release_name
                                    ORDER BY ac.id, rpr.id""")
                log("Create pairs: Insert rows into DB.")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows)
                            count += len(rows)
                            mb_conn.commit()
                            log("Create pairs: inserted %d rows." % count)
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    release_name = row['release_name']
                    if config.REMOVE_NON_WORD_CHARS:
                        recording_name = re.sub(r'\W+', '', recording_name)
                    if recording_name not in artist_recordings:
                        if config.REMOVE_NON_WORD_CHARS:
                            artist_credit_name = re.sub(r'\W+', '', artist_credit_name)
                            release_name = re.sub(r'\W+', '', release_name)
                        artist_recordings[recording_name] = (recording_name, row['recording_id'],
                                                             artist_credit_name, row['artist_credit_id'],
                                                             release_name, row['release_id'])

                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("Create pairs: inserted %d rows total." % count)
            stats["recording_artist_pair_count"] = count

            log("Create pairs: create indexes")
            create_indexes(mb_conn)

            log("Create pairs: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    stats["completed"] = datetime.datetime.utcnow().isoformat()
    with psycopg2.connect(config.DB_CONNECT_MB) as conn:
        with conn.cursor() as curs:
            curs.execute("""INSERT INTO mapping.mapping_stats (stats) VALUES (%s)""", ((ujson.dumps(stats),)))
        conn.commit()

    log("done")
    print()