def upload_test_mapped_listens_to_hdfs(cls, listens_path, mapping_path, mapped_listens_path):
    """Build mapped listens for the test window and persist them to HDFS.

    Reads raw listens and the MSID-MBID mapping from HDFS, joins them, and
    writes the result as parquet to *mapped_listens_path*.
    """
    # Both window bounds are cls.date, i.e. a single-day training window.
    raw_listens_df = dataframe_utils.get_listens_for_training_model_window(
        cls.date, cls.date, listens_path
    )
    raw_mapping_df = utils.read_files_from_HDFS(mapping_path)
    deduped_mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)
    mapped_df = dataframe_utils.get_mapped_artist_and_recording_mbids(
        raw_listens_df, deduped_mapping_df
    )
    utils.save_parquet(mapped_df, mapped_listens_path)
def test_get_mapped_artist_and_recording_mbids(self):
    """Mapping listens yields 8 rows with the expected columns and saves to HDFS."""
    window_end = get_latest_listen_ts()
    listens_df = dataframe_utils.get_listens_for_training_model_window(
        window_end, window_end, self.listens_path
    )
    raw_mapping_df = utils.read_files_from_HDFS(self.mapping_path)
    mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)

    mapped_listens_path = '/mapped_listens.parquet'
    mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(
        listens_df, mapping_df, mapped_listens_path
    )
    self.assertEqual(mapped_listens.count(), 8)

    expected_cols = [
        'listened_at',
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
        'user_name'
    ]
    self.assertListEqual(sorted(expected_cols), sorted(mapped_listens.columns))

    # The helper must also have persisted the mapped listens to HDFS.
    self.assertTrue(utils.path_exists(mapped_listens_path))
def test_get_listens_for_training_model_window(self):
    """Listens fetched for a 2-day window keep the matchable-name columns
    and contain the expected number of rows.
    """
    to_date = get_latest_listen_ts()
    from_date = stats.offset_days(to_date, 2)
    # Fix: removed leftover debug `print(to_date, from_date)` that polluted
    # the test runner's output.
    test_df = dataframe_utils.get_listens_for_training_model_window(
        to_date, from_date, self.listens_path
    )
    self.assertIn('artist_name_matchable', test_df.columns)
    self.assertIn('track_name_matchable', test_df.columns)
    self.assertEqual(test_df.count(), 11)
def main(train_model_window=None):
    """Create the dataframes needed for model training and save them to HDFS.

    Args:
        train_model_window: passed through to get_dates_to_train_data to pick
            the training window; defaults to None.

    Returns:
        Messages (from prepare_messages) describing data missing from MusicBrainz.

    Raises:
        SparkSessionNotInitializedException: if the Spark session cannot start.
    """
    start_time = time.monotonic()

    # dict to save dataframe metadata which would be later merged in
    # model_metadata dataframe.
    # "updated" should always be set to False in this script.
    metadata = {'updated': False}

    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    metadata['to_date'] = to_date
    metadata['from_date'] = from_date
    window_listens_df = get_listens_for_training_model_window(
        to_date, from_date, path.LISTENBRAINZ_DATA_DIRECTORY
    )
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'
                            .format(from_date=from_date, to_date=to_date,
                                    listens_count=window_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    raw_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(window_listens_df, mapping_df)
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)
    save_playcounts_df(listens_df, recordings_df, users_df, metadata)

    metadata['dataframe_id'] = get_dataframe_id(config.DATAFRAME_ID_PREFIX)
    save_dataframe_metadata_to_hdfs(metadata)

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_mb_data_itr = get_data_missing_from_musicbrainz(window_listens_df, mapping_df)
    return prepare_messages(missing_mb_data_itr, from_date, to_date, start_time)
def main(train_model_window, job_type, minimum_listens_threshold=0):
    """Create the dataframes for one dataframe-generation job and save to HDFS.

    Args:
        train_model_window: passed through to get_dates_to_train_data to pick
            the training window.
        job_type: selects the HDFS paths to write to; must be
            "recommendation_recording" or "similar_users".
        minimum_listens_threshold: forwarded to save_playcounts_df; defaults to 0.

    Returns:
        Messages (from prepare_messages) describing data missing from MusicBrainz.

    Raises:
        SparkException: if job_type is not one of the supported values.
        SparkSessionNotInitializedException: if the Spark session cannot start.
    """
    # Resolve all job-specific HDFS destinations up front, before any Spark work.
    if job_type == "recommendation_recording":
        paths = {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        }
    elif job_type == "similar_users":
        paths = {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        }
    else:
        raise SparkException("Invalid job_type parameter received for creating dataframes: " + job_type)

    start_time = time.monotonic()

    # dict to save dataframe metadata which would be later merged in
    # model_metadata dataframe.
    # "updated" should always be set to False in this script.
    metadata = {'updated': False}

    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    metadata['to_date'] = to_date
    metadata['from_date'] = from_date
    window_listens_df = get_listens_for_training_model_window(
        to_date, from_date, path.LISTENBRAINZ_DATA_DIRECTORY
    )
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'
                            .format(from_date=from_date, to_date=to_date,
                                    listens_count=window_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    raw_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    mapping_df = mapping_utils.get_unique_rows_from_mapping(raw_mapping_df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(
        window_listens_df, mapping_df, paths["mapped_listens"]
    )
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata, paths["users"])

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata, paths["recordings"])

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)
    save_playcounts_df(listens_df, recordings_df, users_df,
                       minimum_listens_threshold, metadata, paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_mb_data_itr = get_data_missing_from_musicbrainz(window_listens_df, mapping_df)
    return prepare_messages(missing_mb_data_itr, from_date, to_date, start_time)