コード例 #1
0
    def test_get_top_artist_candidate_set(self):
        """Check columns and row counts of both top-artist candidate set dataframes."""
        listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        recordings = create_dataframes.get_recordings_df(listens, {})
        users_df = create_dataframes.get_users_dataframe(listens, {})
        listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)

        limit = 1
        top_artists = candidate_sets.get_top_artists(listens_subset, limit, [])

        # The call returns two dataframes; the second one is the wider
        # variant carrying the extra artist/recording columns checked below.
        candidate_set, candidate_set_html = \
            candidate_sets.get_top_artist_candidate_set(
                top_artists, recordings, users_df, listens_subset)

        expected_columns = ['recording_id', 'user_id', 'user_name']
        self.assertListEqual(
            sorted(expected_columns), sorted(candidate_set.columns))
        self.assertEqual(candidate_set.count(), 3)

        expected_html_columns = [
            'top_artist_credit_id',
            'top_artist_name',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'recording_id',
            'user_name',
            'user_id',
        ]
        self.assertListEqual(
            sorted(expected_html_columns), sorted(candidate_set_html.columns))
        self.assertEqual(candidate_set_html.count(), 3)
コード例 #2
0
    def test_filter_last_x_days_recordings(self):
        """Check user names and recording MBIDs left after filtering the candidate set.

        Builds the top-artist candidate set from the mapped listens fixtures,
        passes its second returned dataframe through
        ``candidate_sets.filter_last_x_days_recordings`` and compares the
        surviving rows against the expected users and recording MBIDs.
        """
        mapped_listens_df = utils.read_files_from_HDFS(
            self.mapped_listens_path)
        recordings_df = create_dataframes.get_recordings_df(
            mapped_listens_df, {})
        users = create_dataframes.get_users_dataframe(mapped_listens_df, {})
        # The subset is read exactly once; a second identical read adds
        # nothing but an extra round trip to storage.
        mapped_listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)

        top_artist_limit = 1
        top_artist_df = candidate_sets.get_top_artists(mapped_listens_subset,
                                                       top_artist_limit, [])

        # Only the second dataframe of the returned pair is needed here.
        _, candidate_set_df = candidate_sets.get_top_artist_candidate_set(
            top_artist_df, recordings_df, users, mapped_listens_subset)

        df = candidate_sets.filter_last_x_days_recordings(
            candidate_set_df, mapped_listens_subset)

        # Collect once so both assertions look at the same rows and the
        # dataframe is not evaluated twice.
        rows = df.collect()

        received_user_names = sorted(row.user_name for row in rows)
        self.assertEqual(received_user_names, ['rob', 'rob', 'vansika_1'])

        received_recording_mbid = sorted(row.mb_recording_mbid for row in rows)
        expected_recording_mbid = sorted([
            "sf5a56f4-1f83-4681-b319-70a734d0d047",
            "af5a56f4-1f83-4681-b319-70a734d0d047",
            "sf5a56f4-1f83-4681-b319-70a734d0d047",
        ])
        self.assertEqual(expected_recording_mbid, received_recording_mbid)
コード例 #3
0
    def test_get_recordings_dataframe(self):
        """Check row count, columns, metadata and saved path of the recordings dataframe."""
        meta = {}
        listens = utils.read_files_from_HDFS(
            RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        recordings = create_dataframes.get_recordings_df(
            listens, meta, RECOMMENDATION_RECORDINGS_DATAFRAME)

        self.assertEqual(recordings.count(), 20)
        expected_columns = ['artist_credit_id', 'recording_id', 'recording_mbid']
        self.assertCountEqual(expected_columns, recordings.columns)
        # get_recordings_df is expected to record the count in the dict it was given.
        self.assertEqual(meta['recordings_count'], 20)

        # The target path must exist once the dataframe has been built.
        self.assertTrue(utils.path_exists(RECOMMENDATION_RECORDINGS_DATAFRAME))
コード例 #4
0
    def test_get_recordings_dataframe(self):
        """Check row count, columns, metadata and saved path of the recordings dataframe."""
        meta = {}
        listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        recordings = create_dataframes.get_recordings_df(listens, meta)

        self.assertEqual(recordings.count(), 3)

        # Column order is irrelevant, so both sides are sorted before comparing.
        expected_columns = sorted(self.get_recordings_df().columns)
        actual_columns = sorted(recordings.columns)
        self.assertListEqual(expected_columns, actual_columns)

        self.assertEqual(meta['recordings_count'], 3)

        # The recordings dataframe path must exist after the call.
        self.assertTrue(utils.path_exists(path.RECORDINGS_DATAFRAME_PATH))
コード例 #5
0
    def test_save_playcounts_df(self):
        """Save the playcounts dataframe and check what can be read back from its path."""
        meta = {}
        listens = utils.read_files_from_HDFS(
            RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        users = create_dataframes.get_users_dataframe(
            listens, {}, RECOMMENDATION_RECORDING_USERS_DATAFRAME)
        recordings = create_dataframes.get_recordings_df(
            listens, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)
        listens_df = create_dataframes.get_listens_df(listens, {})

        # Only this call receives the metadata dict that is asserted on below.
        create_dataframes.save_playcounts_df(
            listens_df, recordings, users, meta,
            RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME)

        saved = utils.read_files_from_HDFS(
            RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME)
        self.assertEqual(saved.count(), 20)

        # Column order matters here: the lists are compared unsorted.
        expected_columns = ['spark_user_id', 'recording_id', 'count']
        self.assertListEqual(expected_columns, saved.columns)
        self.assertEqual(meta['playcounts_count'], 20)
コード例 #6
0
    def test_save_playcounts_df(self):
        """Save the playcounts dataframe and check what can be read back from its path."""
        meta = {}
        listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        users = create_dataframes.get_users_dataframe(listens, {})
        recordings = create_dataframes.get_recordings_df(listens, {})
        listens_df = create_dataframes.get_listens_df(listens, {})

        # Only this call receives the metadata dict that is asserted on below.
        create_dataframes.save_playcounts_df(
            listens_df, recordings, users, meta)

        saved = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
        self.assertEqual(saved.count(), 5)

        # Column order matters here: the lists are compared unsorted.
        expected_columns = ['user_id', 'recording_id', 'count']
        self.assertListEqual(expected_columns, saved.columns)
        # The recorded metadata count must agree with the stored row count.
        self.assertEqual(meta['playcounts_count'], saved.count())
コード例 #7
0
    @classmethod
    def setUpClass(cls):
        """Build the class-level dataframes shared by the candidate set tests.

        Loads the ``mapped_listens_candidate_sets.parquet`` fixture from the
        local test data directory, derives a date-bounded listens subset and
        creates the recordings and users dataframes from the full fixture.

        unittest invokes this hook on the class itself, so it has to be a
        classmethod; without the decorator the call fails for lack of the
        ``cls`` argument.
        """
        super(CandidateSetsTestClass, cls).setUpClass()

        # Rows without a recording MBID are dropped from the fixture up front.
        fixture_path = "file://" + os.path.join(
            TEST_DATA_PATH, 'mapped_listens_candidate_sets.parquet')
        cls.mapped_listens_df = listenbrainz_spark \
            .session \
            .read \
            .parquet(fixture_path) \
            .where("recording_mbid IS NOT NULL")

        to_date = datetime(2019, 1, 21, tzinfo=timezone.utc)
        # NOTE(review): presumably yields a date 4 days before to_date —
        # confirm against stats.offset_days.
        from_date = stats.offset_days(to_date, 4)

        cls.mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
            cls.mapped_listens_df, from_date, to_date)

        cls.recordings_df = create_dataframes.get_recordings_df(
            cls.mapped_listens_df, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)

        cls.users_df = create_dataframes.get_users_dataframe(
            cls.mapped_listens_df, {},
            RECOMMENDATION_RECORDING_USERS_DATAFRAME)
コード例 #8
0
    def test_save_playcounts_df(self):
        """Save the playcounts dataframe and check what is read back from that path.

        The dataframe is read from the same ``self.playcounts_path`` it was
        saved to, so the assertions always inspect the output of this test's
        own ``save_playcounts_df`` call rather than whatever happens to live
        at a separately named location.
        """
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        users_df = create_dataframes.get_users_dataframe(
            mapped_listens, {}, self.users_path)
        recordings_df = create_dataframes.get_recordings_df(
            mapped_listens, {}, self.recordings_path)
        listens_df = create_dataframes.get_listens_df(mapped_listens, {})

        threshold = 0
        create_dataframes.save_playcounts_df(listens_df, recordings_df,
                                             users_df, threshold, metadata,
                                             self.playcounts_path)

        # Read back exactly what was written above.
        playcounts_df = utils.read_files_from_HDFS(self.playcounts_path)

        # Count once and reuse the value for both count assertions.
        playcounts_count = playcounts_df.count()
        self.assertEqual(playcounts_count, 5)

        # Column order matters here: the lists are compared unsorted.
        self.assertListEqual(['user_id', 'recording_id', 'count'],
                             playcounts_df.columns)
        self.assertEqual(metadata['playcounts_count'], playcounts_count)
コード例 #9
0
    def test_get_similar_artist_candidate_set_df(self):
        """Check columns and row counts of both similar-artist candidate set dataframes."""
        listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        recordings = create_dataframes.get_recordings_df(
            listens, {}, self.recordings_path)
        users_df = create_dataframes.get_users_dataframe(
            listens, {}, self.users_path)
        listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)

        # Two users share the same similar artist; one row per user.
        rob_row = Row(similar_artist_credit_id=2,
                      similar_artist_name='martinkemp',
                      user_name='rob')
        vansika_row = Row(similar_artist_credit_id=2,
                          similar_artist_name='martinkemp',
                          user_name='vansika_1')
        rob_df = utils.create_dataframe(rob_row, schema=None)
        vansika_df = utils.create_dataframe(vansika_row, schema=None)
        similar_artists = rob_df.union(vansika_df)

        # The call returns two dataframes; the second one is the wider
        # variant carrying the extra artist/recording columns checked below.
        candidate_set, candidate_set_html = \
            candidate_sets.get_similar_artist_candidate_set(
                similar_artists, recordings, users_df, listens_subset)

        expected_columns = ['recording_id', 'user_id', 'user_name']
        self.assertListEqual(
            sorted(expected_columns), sorted(candidate_set.columns))
        self.assertEqual(candidate_set.count(), 2)

        expected_html_columns = [
            'similar_artist_credit_id',
            'similar_artist_name',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'recording_id',
            'user_name',
            'user_id',
        ]
        self.assertListEqual(
            sorted(expected_html_columns), sorted(candidate_set_html.columns))
        self.assertEqual(candidate_set_html.count(), 2)