def test_get_top_artist_candidate_set(self):
    """Check columns and row counts of both top-artist candidate set dataframes.

    Builds the recordings and users dataframes from the mapped listens,
    fetches top artists from the mapped listens subset with a limit of 1,
    and asserts that each dataframe returned by
    ``get_top_artist_candidate_set`` has the expected columns and 3 rows.
    """
    listens_df = utils.read_files_from_HDFS(self.mapped_listens_path)
    recordings_df = create_dataframes.get_recordings_df(listens_df, {})
    users_df = create_dataframes.get_users_dataframe(listens_df, {})
    listens_subset_df = utils.read_files_from_HDFS(
        self.mapped_listens_subset_path)

    top_artists = candidate_sets.get_top_artists(listens_subset_df, 1, [])
    candidate_df, candidate_html_df = candidate_sets.get_top_artist_candidate_set(
        top_artists, recordings_df, users_df, listens_subset_df)

    # Column order is not part of the contract here, so compare sorted names.
    expected_columns = sorted(['recording_id', 'user_id', 'user_name'])
    self.assertListEqual(expected_columns, sorted(candidate_df.columns))
    self.assertEqual(candidate_df.count(), 3)

    expected_html_columns = sorted([
        'top_artist_credit_id',
        'top_artist_name',
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
        'recording_id',
        'user_name',
        'user_id',
    ])
    self.assertListEqual(expected_html_columns,
                         sorted(candidate_html_df.columns))
    self.assertEqual(candidate_html_df.count(), 3)
def test_filter_last_x_days_recordings(self):
    """Check user names and recording MBIDs left by filter_last_x_days_recordings.

    Builds the second dataframe returned by ``get_top_artist_candidate_set``
    (top artist limit of 1), passes it together with the mapped listens
    subset to ``filter_last_x_days_recordings`` and asserts on the
    ``user_name`` and ``mb_recording_mbid`` values of the resulting rows.
    """
    mapped_listens_df = utils.read_files_from_HDFS(self.mapped_listens_path)
    # The subset is read once; it feeds the top-artist lookup, the candidate
    # set generation and the final filter.
    mapped_listens_subset = utils.read_files_from_HDFS(
        self.mapped_listens_subset_path)

    recordings_df = create_dataframes.get_recordings_df(mapped_listens_df, {})
    users = create_dataframes.get_users_dataframe(mapped_listens_df, {})

    top_artist_limit = 1
    top_artist_df = candidate_sets.get_top_artists(
        mapped_listens_subset, top_artist_limit, [])

    _, candidate_set_df = candidate_sets.get_top_artist_candidate_set(
        top_artist_df, recordings_df, users, mapped_listens_subset)

    df = candidate_sets.filter_last_x_days_recordings(
        candidate_set_df, mapped_listens_subset)

    # Collect once and reuse the rows for both assertions instead of running
    # the Spark job twice.
    rows = df.collect()

    user_name = [row.user_name for row in rows]
    self.assertEqual(sorted(user_name), ['rob', 'rob', 'vansika_1'])

    received_recording_mbid = sorted(row.mb_recording_mbid for row in rows)
    expected_recording_mbid = sorted([
        "sf5a56f4-1f83-4681-b319-70a734d0d047",
        "af5a56f4-1f83-4681-b319-70a734d0d047",
        "sf5a56f4-1f83-4681-b319-70a734d0d047",
    ])
    self.assertEqual(expected_recording_mbid, received_recording_mbid)
def test_get_recordings_dataframe(self):
    """Check the recordings dataframe built from the recording mapped listens.

    Asserts the row count (20), the column names, the ``recordings_count``
    entry written into the metadata dict, and that
    ``RECOMMENDATION_RECORDINGS_DATAFRAME`` exists afterwards.
    """
    metadata = {}
    listens = utils.read_files_from_HDFS(
        RECOMMENDATION_RECORDING_MAPPED_LISTENS)
    recordings_df = create_dataframes.get_recordings_df(
        listens, metadata, RECOMMENDATION_RECORDINGS_DATAFRAME)

    self.assertEqual(recordings_df.count(), 20)

    expected_columns = ['artist_credit_id', 'recording_id', 'recording_mbid']
    self.assertCountEqual(expected_columns, recordings_df.columns)

    self.assertEqual(metadata['recordings_count'], 20)

    self.assertTrue(utils.path_exists(RECOMMENDATION_RECORDINGS_DATAFRAME))
def test_get_recordings_dataframe(self):
    """Check the recordings dataframe built from ``self.mapped_listens_path``.

    Asserts the row count (3), that the columns match those of
    ``self.get_recordings_df()``, the ``recordings_count`` metadata entry,
    and that ``path.RECORDINGS_DATAFRAME_PATH`` exists afterwards.
    """
    metadata = {}
    listens = utils.read_files_from_HDFS(self.mapped_listens_path)
    recordings_df = create_dataframes.get_recordings_df(listens, metadata)

    self.assertEqual(recordings_df.count(), 3)

    # Compare against the reference dataframe's columns, ignoring order.
    expected_columns = sorted(self.get_recordings_df().columns)
    actual_columns = sorted(recordings_df.columns)
    self.assertListEqual(expected_columns, actual_columns)

    self.assertEqual(metadata['recordings_count'], 3)

    # NOTE(review): presumably get_recordings_df saves here when no path is
    # passed -- confirm against create_dataframes.
    self.assertTrue(utils.path_exists(path.RECORDINGS_DATAFRAME_PATH))
def test_save_playcounts_df(self):
    """Check the playcounts dataframe saved by ``save_playcounts_df``.

    Saves playcounts to ``RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME``,
    reads them back from the same path and asserts the row count (20), the
    exact column order, and the ``playcounts_count`` metadata entry.
    """
    metadata = {}
    listens = utils.read_files_from_HDFS(
        RECOMMENDATION_RECORDING_MAPPED_LISTENS)

    # Input dataframes; their own metadata is thrown away via fresh dicts.
    users_df = create_dataframes.get_users_dataframe(
        listens, {}, RECOMMENDATION_RECORDING_USERS_DATAFRAME)
    recordings_df = create_dataframes.get_recordings_df(
        listens, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)
    listens_df = create_dataframes.get_listens_df(listens, {})

    create_dataframes.save_playcounts_df(
        listens_df,
        recordings_df,
        users_df,
        metadata,
        RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
    )

    saved_df = utils.read_files_from_HDFS(
        RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME)
    self.assertEqual(saved_df.count(), 20)
    self.assertListEqual(['spark_user_id', 'recording_id', 'count'],
                         saved_df.columns)
    self.assertEqual(metadata['playcounts_count'], 20)
def test_save_playcounts_df(self):
    """Check the playcounts dataframe read from ``path.PLAYCOUNTS_DATAFRAME_PATH``.

    Calls ``save_playcounts_df`` without an explicit save path, then asserts
    the row count (5), the exact column order, and that the
    ``playcounts_count`` metadata entry equals the dataframe's row count.
    """
    metadata = {}
    listens = utils.read_files_from_HDFS(self.mapped_listens_path)

    # Input dataframes; their own metadata is thrown away via fresh dicts.
    users_df = create_dataframes.get_users_dataframe(listens, {})
    recordings_df = create_dataframes.get_recordings_df(listens, {})
    listens_df = create_dataframes.get_listens_df(listens, {})

    create_dataframes.save_playcounts_df(
        listens_df, recordings_df, users_df, metadata)

    # NOTE(review): presumably the default save location of
    # save_playcounts_df -- confirm against create_dataframes.
    saved_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)

    self.assertEqual(saved_df.count(), 5)
    self.assertListEqual(['user_id', 'recording_id', 'count'],
                         saved_df.columns)
    self.assertEqual(metadata['playcounts_count'], saved_df.count())
@classmethod
def setUpClass(cls):
    """Load the shared dataframes used by the candidate set tests.

    Sets the following class attributes:

    * ``mapped_listens_df`` -- rows of the local
      ``mapped_listens_candidate_sets.parquet`` test file whose
      ``recording_mbid`` is not NULL.
    * ``mapped_listens_subset`` -- result of
      ``get_listens_to_fetch_top_artists`` for the computed date range.
    * ``recordings_df`` / ``users_df`` -- dataframes derived from
      ``mapped_listens_df``.

    unittest invokes ``setUpClass`` on the class itself, so it must be a
    classmethod.
    """
    super(CandidateSetsTestClass, cls).setUpClass()

    parquet_path = "file://" + os.path.join(
        TEST_DATA_PATH, 'mapped_listens_candidate_sets.parquet')
    cls.mapped_listens_df = (
        listenbrainz_spark.session.read
        .parquet(parquet_path)
        .where("recording_mbid IS NOT NULL")
    )

    to_date = datetime(2019, 1, 21, tzinfo=timezone.utc)
    # NOTE(review): presumably 4 days before to_date -- confirm against
    # stats.offset_days.
    from_date = stats.offset_days(to_date, 4)
    cls.mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
        cls.mapped_listens_df, from_date, to_date)

    cls.recordings_df = create_dataframes.get_recordings_df(
        cls.mapped_listens_df, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)
    cls.users_df = create_dataframes.get_users_dataframe(
        cls.mapped_listens_df, {}, RECOMMENDATION_RECORDING_USERS_DATAFRAME)
def test_save_playcounts_df(self): metadata = {} mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path) users_df = create_dataframes.get_users_dataframe( mapped_listens, {}, self.users_path) recordings_df = create_dataframes.get_recordings_df( mapped_listens, {}, self.recordings_path) listens_df = create_dataframes.get_listens_df(mapped_listens, {}) threshold = 0 create_dataframes.save_playcounts_df(listens_df, recordings_df, users_df, threshold, metadata, self.playcounts_path) playcounts_df = utils.read_files_from_HDFS( path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME) self.assertEqual(playcounts_df.count(), 5) self.assertListEqual(['user_id', 'recording_id', 'count'], playcounts_df.columns) self.assertEqual(metadata['playcounts_count'], playcounts_df.count())
def test_get_similar_artist_candidate_set_df(self):
    """Check columns and row counts of both similar-artist candidate set dataframes.

    Builds a two-row similar-artist dataframe (the same artist for users
    'rob' and 'vansika_1'), passes it to
    ``get_similar_artist_candidate_set`` and asserts that each returned
    dataframe has the expected columns and 2 rows.
    """
    listens_df = utils.read_files_from_HDFS(self.mapped_listens_path)
    recordings_df = create_dataframes.get_recordings_df(
        listens_df, {}, self.recordings_path)
    users_df = create_dataframes.get_users_dataframe(
        listens_df, {}, self.users_path)
    listens_subset_df = utils.read_files_from_HDFS(
        self.mapped_listens_subset_path)

    # One single-row dataframe per user, unioned into the input dataframe.
    rob_df = utils.create_dataframe(
        Row(similar_artist_credit_id=2,
            similar_artist_name='martinkemp',
            user_name='rob'),
        schema=None)
    vansika_df = utils.create_dataframe(
        Row(similar_artist_credit_id=2,
            similar_artist_name='martinkemp',
            user_name='vansika_1'),
        schema=None)
    similar_artist_df = rob_df.union(vansika_df)

    candidate_df, candidate_html_df = candidate_sets.get_similar_artist_candidate_set(
        similar_artist_df, recordings_df, users_df, listens_subset_df)

    # Column order is not part of the contract here, so compare sorted names.
    expected_columns = sorted(['recording_id', 'user_id', 'user_name'])
    self.assertListEqual(expected_columns, sorted(candidate_df.columns))
    self.assertEqual(candidate_df.count(), 2)

    expected_html_columns = sorted([
        'similar_artist_credit_id',
        'similar_artist_name',
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
        'recording_id',
        'user_name',
        'user_id',
    ])
    self.assertListEqual(expected_html_columns,
                         sorted(candidate_html_df.columns))
    self.assertEqual(candidate_html_df.count(), 2)