# Imports assumed by the examples below. Helper functions such as log(),
# insert_rows(), create_schema(), create_tables(), create_temp_release_table(),
# create_indexes() and swap_table_and_indexes(), along with BATCH_SIZE, are
# defined elsewhere in the module.
import datetime
import re
import subprocess

import psycopg2
import psycopg2.extras
import ujson
from unidecode import unidecode

import config


def create_mbid_mapping():
    """
        This function is the heart of the mbid mapping. It
        calculates the intermediate table and then fetches all the recordings
        from these tables so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pairs that make
        them suitable for inclusion in the msid-mapping.
    """

    log("mbid mapping: start")
    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("mbid mapping: create schema")
            create_schema(mb_conn)
            log("mbid mapping: drop old tables, create new tables")
            create_tables(mb_conn)

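            # Build the temporary ranked-release table that the recording
            # query below joins against (mapping.tmp_mbid_mapping_releases).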
            create_temp_release_table(mb_conn)
            with mb_conn.cursor() as mb_curs2:
                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                batch_count = 0
                serial = 1
                log("mbid mapping: fetch recordings")
                mb_curs.execute("""SELECT r.name AS recording_name,
                                          r.gid AS recording_mbid,
                                          ac.name AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          rl.name AS release_name,
                                          rl.gid AS release_mbid,
                                          rpr.id AS score
                                     FROM recording r
                                     JOIN artist_credit ac
                                       ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn
                                       ON ac.id = acn.artist_credit
                                     JOIN track t
                                       ON t.recording = r.id
                                     JOIN medium m
                                       ON m.id = t.medium
                                     JOIN release rl
                                       ON rl.id = m.release
                                     JOIN mapping.tmp_mbid_mapping_releases rpr
                                       ON rl.id = rpr.release
                                LEFT JOIN release_country rc
                                       ON rc.release = rl.id
                                    GROUP BY rpr.id, ac.id, rl.gid, artist_credit_name, r.gid, r.name, release_name
                                    ORDER BY ac.id, rpr.id""")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_mbid_mapping",
                                        rows)
                            count += len(rows)
                            mb_conn.commit()
                            rows = []
                            batch_count += 1

                            if batch_count % 200 == 0:
                                log("mbid mapping: inserted %d rows." % count)

                    try:
                        recording_name = row['recording_name']
                        artist_credit_name = row['artist_credit_name']
                        release_name = row['release_name']
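                        # Build the normalized lookup key: artist credit name
                        # plus recording name with non-word characters
                        # removed, lowercased and transliterated to ASCII.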
                        combined_lookup = unidecode(
                            re.sub(r'[^\w]+', '', artist_credit_name +
                                   recording_name).lower())
                        if recording_name not in artist_recordings:
                            artist_recordings[recording_name] = (
                                serial, recording_name, row['recording_mbid'],
                                artist_credit_name, row['artist_credit_id'],
                                release_name, row['release_mbid'],
                                combined_lookup, row['score'])
                            serial += 1
                    except TypeError:
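                        # Log the offending row for inspection before
                        # re-raising.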
                        log(row)
                        raise

                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_mbid_mapping", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("mbid mapping: inserted %d rows total." % count)
            log("mbid mapping: create indexes")
            create_indexes(mb_conn)

            log("mbid mapping: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    log("mbid mapping: done")


def create_year_mapping():
    """
        This function is the heart of the recording artist pair year mapping. It
        calculates the intermediate table and then fetches all the recordings
        from these tables so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pairs that make
        them suitable for inclusion in the msid-mapping.
    """

    log("year mapping: start")
    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("year mapping: drop old tables, create new tables")
            create_schema(mb_conn)
            create_tables(mb_conn)

            create_temp_release_table(mb_conn)
            with mb_conn.cursor() as mb_curs2:

                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                log("year mapping: fetch recordings")
                mb_curs.execute("""SELECT lower(musicbrainz.musicbrainz_unaccent(r.name)) AS recording_name,
                                          lower(musicbrainz.musicbrainz_unaccent(ac.name)) AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          date_year AS year
                                     FROM recording r
                                     JOIN artist_credit ac
                                       ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn
                                       ON ac.id = acn.artist_credit
                                     JOIN track t
                                       ON t.recording = r.id
                                     JOIN medium m
                                       ON m.id = t.medium
                                     JOIN release rl
                                       ON rl.id = m.release
                                     JOIN mapping.tmp_year_mapping_release rpr
                                       ON rl.id = rpr.release
                                LEFT JOIN release_country rc
                                       ON rc.release = rl.id
                                    ORDER BY ac.id, rpr.id""")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_year_mapping", rows)
                            count += len(rows)
                            mb_conn.commit()
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    if recording_name not in artist_recordings:
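                        # Single quotes are doubled, presumably because
                        # insert_rows() interpolates these values directly
                        # into the SQL text.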
                        artist_recordings[recording_name] = (recording_name.replace("'", "''"),
                                                             artist_credit_name.replace("'", "''"), row['year'])
                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_year_mapping", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("year mapping: inserted %d rows total." % count)
            create_indexes(mb_conn)

            log("year mapping: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    log("year mapping: done")


def create_pairs():
    """
        This function is the heart of the recording artist pair mapping. It
        calculates the intermediate table and then fetches all the recordings
        from these tables so that duplicate recording-artist pairs all
        resolve to the "canonical" release-artist pairs that make
        them suitable for inclusion in the msid-mapping.
    """

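    # Run metadata (start time, git revision) that is stored in the
    # mapping.mapping_stats table once the run completes.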
    stats = {}
    stats["started"] = datetime.datetime.utcnow().isoformat()
    stats["git commit hash"] = subprocess.getoutput("git rev-parse HEAD")

    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            log("Create pairs: drop old tables, create new tables")
            create_schema(mb_conn)
            create_tables(mb_conn)

            stats = create_temp_release_table(mb_conn, stats)

            mb_curs.execute("SELECT COUNT(*) from musicbrainz.recording")
            stats["mb_recording_count"] = mb_curs.fetchone()[0]

            with mb_conn.cursor() as mb_curs2:

                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                log("Create pairs: fetch recordings")
                mb_curs.execute("""SELECT lower(musicbrainz.musicbrainz_unaccent(r.name)) AS recording_name,
                                          r.id AS recording_id,
                                          lower(musicbrainz.musicbrainz_unaccent(ac.name)) AS artist_credit_name,
                                          ac.id AS artist_credit_id,
                                          lower(musicbrainz.musicbrainz_unaccent(rl.name)) AS release_name,
                                          rl.id as release_id,
                                          rpr.id
                                     FROM recording r
                                     JOIN artist_credit ac ON r.artist_credit = ac.id
                                     JOIN artist_credit_name acn ON ac.id = acn.artist_credit
                                     JOIN track t ON t.recording = r.id
                                     JOIN medium m ON m.id = t.medium
                                     JOIN release rl ON rl.id = m.release
                                     JOIN mapping.tmp_recording_pair_releases rpr ON rl.id = rpr.release
                                    GROUP BY rpr.id, ac.id, rl.id, artist_credit_name, r.id, r.name, release_name
                                    ORDER BY ac.id, rpr.id""")
                log("Create pairs: Insert rows into DB.")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows)
                            count += len(rows)
                            mb_conn.commit()
                            log("Create pairs: inserted %d rows." % count)
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    release_name = row['release_name']
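                    # Optionally strip non-word characters, presumably to
                    # make matching more forgiving.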
                    if config.REMOVE_NON_WORD_CHARS:
                        recording_name = re.sub(r'\W+', '', recording_name)
                    if recording_name not in artist_recordings:
                        if config.REMOVE_NON_WORD_CHARS:
                            artist_credit_name = re.sub(r'\W+', '', artist_credit_name)
                            release_name = re.sub(r'\W+', '', release_name)
                        artist_recordings[recording_name] = (recording_name, row['recording_id'],
                                                             artist_credit_name, row['artist_credit_id'],
                                                             release_name, row['release_id'])

                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.tmp_recording_artist_credit_pairs", rows)
                    mb_conn.commit()
                    count += len(rows)

            log("Create pairs: inserted %d rows total." % count)
            stats["recording_artist_pair_count"] = count

            log("Create pairs: create indexes")
            create_indexes(mb_conn)

            log("Create pairs: swap tables and indexes into production.")
            swap_table_and_indexes(mb_conn)

    stats["completed"] = datetime.datetime.utcnow().isoformat()
    with psycopg2.connect(config.DB_CONNECT_MB) as conn:
        with conn.cursor() as curs:
            curs.execute("""INSERT INTO mapping.mapping_stats (stats) VALUES (%s)""", ((ujson.dumps(stats),)))
        conn.commit()

    log("done")
    print()


# Example #4: an earlier variant of create_pairs().
def create_pairs():

    stats = {}
    stats["started"] = datetime.datetime.utcnow().isoformat()
    stats["git commit hash"] = subprocess.getoutput("git rev-parse HEAD")

    with psycopg2.connect(config.DB_CONNECT_MB) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:

            # Create the dest table (perhaps dropping the old one first)
            print("Drop/create pairs table")
            create_schema(mb_conn)
            create_tables(mb_conn)

            print("select releases from MB")
            stats = create_temp_release_table(mb_conn, stats)

            mb_curs.execute("SELECT COUNT(*) from musicbrainz.recording")
            stats["mb_recording_count"] = mb_curs.fetchone()[0]

            with mb_conn.cursor() as mb_curs2:

                rows = []
                last_ac_id = None
                artist_recordings = {}
                count = 0
                print("Run fetch recordings query")
                mb_curs.execute(SELECT_RECORDING_PAIRS_QUERY)
                print("Fetch recordings and insert")
                while True:
                    row = mb_curs.fetchone()
                    if not row:
                        break

                    if not last_ac_id:
                        last_ac_id = row['artist_credit_id']

                    if row['artist_credit_id'] != last_ac_id:
                        # insert the rows that made it
                        rows.extend(artist_recordings.values())
                        artist_recordings = {}

                        if len(rows) > BATCH_SIZE:
                            insert_rows(mb_curs2, "mapping.recording_artist_credit_pairs", rows)
                            count += len(rows)
                            mb_conn.commit()
                            print("inserted %d rows." % count)
                            rows = []

                    recording_name = row['recording_name']
                    artist_credit_name = row['artist_credit_name']
                    release_name = row['release_name']
                    if config.REMOVE_NON_WORD_CHARS:
                        recording_name = re.sub(r'\W+', '', recording_name)
                    if recording_name not in artist_recordings:
                        if config.REMOVE_NON_WORD_CHARS:
                            artist_credit_name = re.sub(r'\W+', '', artist_credit_name)
                            release_name = re.sub(r'\W+', '', release_name)
                        artist_recordings[recording_name] = (recording_name, row['recording_id'],
                                                             artist_credit_name, row['artist_credit_id'],
                                                             release_name, row['release_id'])

                    last_ac_id = row['artist_credit_id']

                rows.extend(artist_recordings.values())
                if rows:
                    insert_rows(mb_curs2, "mapping.recording_artist_credit_pairs", rows)
                    mb_conn.commit()
                    count += len(rows)

            print("inserted %d rows total." % count)
            stats["recording_artist_pair_count"] = count

            print("Create indexes")
            create_indexes(mb_conn)

    stats["completed"] = datetime.datetime.utcnow().isoformat()
    with open("stats/recording-pairs-stats.json", "w") as f:
        f.write(ujson.dumps(stats, indent=2) + "\n")

    print("done")