def upload_test_mapped_listens_to_hdfs(cls, listens_path, mapping_path, mapped_listens_path):
    """Build mapped test listens and persist them to HDFS as parquet.

    Fetches listens for a single-day training window (``cls.date`` used as
    both window bounds), joins them with the de-duplicated MSID<->MBID
    mapping, and saves the mapped dataframe to ``mapped_listens_path``.
    """
    # Single-day window: both bounds are cls.date.
    listens_df = dataframe_utils.get_listens_for_training_model_window(cls.date, cls.date, listens_path)

    raw_mapping_df = utils.read_files_from_HDFS(mapping_path)
    distinct_mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)

    mapped_df = dataframe_utils.get_mapped_artist_and_recording_mbids(listens_df, distinct_mapping_df)
    utils.save_parquet(mapped_df, mapped_listens_path)
    def test_get_mapped_artist_and_recording_mbids(self):
        """Mapped listens have the expected count and columns and are written to HDFS."""
        latest_ts = get_latest_listen_ts()
        listens_df = dataframe_utils.get_listens_for_training_model_window(latest_ts, latest_ts, self.listens_path)

        raw_mapping = utils.read_files_from_HDFS(self.mapping_path)
        distinct_mapping = mapping_utils.get_unique_rows_from_mapping(raw_mapping)
        dest_path = '/mapped_listens.parquet'

        result = dataframe_utils.get_mapped_artist_and_recording_mbids(listens_df, distinct_mapping, dest_path)
        self.assertEqual(result.count(), 8)

        expected_columns = sorted([
            'listened_at',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'mb_release_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'user_name',
        ])
        self.assertListEqual(expected_columns, sorted(result.columns))

        # The helper should also have persisted the parquet file to HDFS.
        self.assertTrue(utils.path_exists(dest_path))
 def test_get_listens_for_training_model_window(self):
     """Listens fetched for a 2-day training window must carry the
     matchable name columns and the expected number of rows.
     """
     to_date = get_latest_listen_ts()
     from_date = stats.offset_days(to_date, 2)
     test_df = dataframe_utils.get_listens_for_training_model_window(to_date, from_date, self.listens_path)
     # Columns used for fuzzy matching against the MSID<->MBID mapping.
     self.assertIn('artist_name_matchable', test_df.columns)
     self.assertIn('track_name_matchable', test_df.columns)
     self.assertEqual(test_df.count(), 11)
Example #4
0
def main(train_model_window=None):
    """Create the dataframes needed for model training and save them to HDFS.

    Fetches listens for the training window, joins them against the
    MSID<->MBID mapping, derives the users/recordings/playcounts
    dataframes, records dataframe metadata, and collects the data that
    is missing from MusicBrainz.

    Args:
        train_model_window: size of the training window handed to
            get_dates_to_train_data (None selects its default).

    Returns:
        Messages produced by prepare_messages for the request consumer.

    Raises:
        SparkSessionNotInitializedException: if the Spark session cannot
            be created (logged, then re-raised).
    """
    start_time = time.monotonic()

    # Dataframe metadata, later merged into the model_metadata dataframe.
    # "updated" should always be set to False in this script.
    metadata = {'updated': False}

    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    partial_listens_df = get_listens_for_training_model_window(to_date, from_date, path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'
                            .format(from_date=from_date, to_date=to_date, listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    raw_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, msid_mbid_mapping_df)
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, metadata)

    metadata['dataframe_id'] = get_dataframe_id(config.DATAFRAME_ID_PREFIX)
    save_dataframe_metadata_to_hdfs(metadata)

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_mb_data_itr = get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df)

    return prepare_messages(missing_mb_data_itr, from_date, to_date, start_time)
Example #5
0
def main(train_model_window, job_type, minimum_listens_threshold=0):
    """Create the dataframes for the given job type and save them to HDFS.

    Supports two jobs — "recommendation_recording" and "similar_users" —
    which differ only in the HDFS paths and dataframe-id prefix used.
    Fetches listens for the training window, joins them with the
    MSID<->MBID mapping, derives users/recordings/playcounts dataframes,
    records dataframe metadata, and collects the data missing from
    MusicBrainz.

    Args:
        train_model_window: size of the training window handed to
            get_dates_to_train_data.
        job_type: "recommendation_recording" or "similar_users".
        minimum_listens_threshold: minimum listen count forwarded to
            save_playcounts_df.

    Returns:
        Messages produced by prepare_messages for the request consumer.

    Raises:
        SparkException: for an unknown job_type.
        SparkSessionNotInitializedException: if the Spark session cannot
            be created (logged, then re-raised).
    """
    # Per-job HDFS locations and dataframe-id prefix.
    job_paths = {
        "recommendation_recording": {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        },
        "similar_users": {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        },
    }
    if job_type not in job_paths:
        raise SparkException("Invalid job_type parameter received for creating dataframes: " + job_type)
    paths = job_paths[job_type]

    start_time = time.monotonic()

    # Dataframe metadata, later merged into the model_metadata dataframe.
    # "updated" should always be set to False in this script.
    metadata = {'updated': False}

    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    partial_listens_df = get_listens_for_training_model_window(to_date, from_date, path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'
                            .format(from_date=from_date, to_date=to_date, listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    raw_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, msid_mbid_mapping_df,
                                                              paths["mapped_listens"])
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata, paths["users"])

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata, paths["recordings"])

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, minimum_listens_threshold, metadata, paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_mb_data_itr = get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df)

    return prepare_messages(missing_mb_data_itr, from_date, to_date, start_time)