Ejemplo n.º 1
0
    def upload_test_mapped_listens_to_HDFS(cls):
        """Create the mapped-listens dataframe and save it as parquet.

        Listens are fetched from ``cls.listens_path`` with ``cls.date`` used
        for both date arguments, combined with the mapping loaded through
        ``utils.read_files_from_HDFS(cls.mapping_path)``, and the result is
        written to ``cls.mapped_listens_path``.
        """
        listens = create_dataframes.get_listens_for_training_model_window(
            cls.date, cls.date, {}, cls.listens_path)
        mapping = utils.read_files_from_HDFS(cls.mapping_path)

        # Map first, then hand the resulting dataframe straight to the writer.
        utils.save_parquet(
            create_dataframes.get_mapped_artist_and_recording_mbids(
                listens, mapping),
            cls.mapped_listens_path)
Ejemplo n.º 2
0
 def test_get_listens_for_training_model_window(self):
     """Check the metadata and columns produced for a ``self.date`` window.

     The call must record ``self.date`` under both ``'to_date'`` and
     ``'from_date'`` in the metadata dict it is given, and the returned
     dataframe must not expose the ``artist_mbids`` / ``recording_mbid``
     columns.
     """
     recorded = {}
     listens = create_dataframes.get_listens_for_training_model_window(
         self.date, self.date, recorded, self.listens_path)

     # Both bounds were passed as self.date, so both must be echoed back.
     for key in ('to_date', 'from_date'):
         self.assertEqual(recorded[key], self.date)

     for column in ('artist_mbids', 'recording_mbid'):
         self.assertNotIn(column, listens.columns)
    def upload_test_mapped_listens_to_HDFS(cls):
        """Create the mapped-listens dataframe and save it as parquet.

        Listens are fetched from ``LISTENS_PATH`` with ``cls.date`` used for
        both date arguments, combined with the mapping loaded through
        ``utils.read_files_from_HDFS(MAPPING_PATH)``, and the result is
        written to ``MAPPED_LISTENS_PATH``.
        """
        listens = create_dataframes.get_listens_for_training_model_window(
            cls.date, cls.date, {}, LISTENS_PATH)
        mapping = utils.read_files_from_HDFS(MAPPING_PATH)

        # Map first, then hand the resulting dataframe straight to the writer.
        utils.save_parquet(
            create_dataframes.get_mapped_artist_and_recording_mbids(
                listens, mapping),
            MAPPED_LISTENS_PATH)
 def test_get_listens_for_training_model_window(self):
     """Check the metadata and columns produced for a computed date window.

     ``to_date`` comes from ``get_latest_listen_ts()`` and ``from_date`` from
     ``stats.offset_days(to_date, 2)``. The call must record both values in
     the metadata dict, and the returned dataframe must not expose the
     ``artist_mbids`` / ``recording_mbid`` columns.
     """
     latest_ts = get_latest_listen_ts()
     shifted_ts = stats.offset_days(latest_ts, 2)
     recorded = {}

     listens = create_dataframes.get_listens_for_training_model_window(
         latest_ts, shifted_ts, recorded, self.listens_path)

     # The metadata dict must carry exactly the bounds that were passed in.
     self.assertEqual(recorded['to_date'], latest_ts)
     self.assertEqual(recorded['from_date'], shifted_ts)

     for column in ('artist_mbids', 'recording_mbid'):
         self.assertNotIn(column, listens.columns)
    def test_prepare_messages(self):
        """Check the messages built from data missing from MusicBrainz.

        The first message returned by ``create_dataframes.prepare_messages``
        is expected to be the ``cf_recording_dataframes`` summary carrying the
        formatted date bounds and two string timing fields. Whatever remains
        in the list must equal the single expected
        ``missing_musicbrainz_data`` message.
        """
        from_date = datetime(2019, 6, 21)
        to_date = datetime(2019, 8, 21)

        listens = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, self.listens_path)
        mapping = utils.read_files_from_HDFS(self.mapping_path)
        started_at = time.monotonic()

        missing_rows = create_dataframes.get_data_missing_from_musicbrainz(
            listens, mapping)
        messages = create_dataframes.prepare_messages(
            missing_rows, from_date, to_date, started_at)

        # Detach the leading summary message; `messages` keeps the rest.
        summary = messages.pop(0)

        self.assertEqual(summary['type'], 'cf_recording_dataframes')
        self.assertEqual(summary['from_date'], from_date.strftime('%b %Y'))
        self.assertEqual(summary['to_date'], to_date.strftime('%b %Y'))
        for timing_key in ('dataframe_upload_time', 'total_time'):
            self.assertIsInstance(summary[timing_key], str)

        less_than_jake = {
            'artist_msid': 'a36d6fc9-49d0-4789-a7dd-a2b72369ca45',
            'artist_name': 'Less Than Jake',
            'listened_at': '2019-01-13 00:00:00',
            'recording_msid': 'cb6985cd-cc71-4d59-b4fb-2e72796af741',
            'release_msid': '',
            'release_name': 'lala',
            'track_name': "Al's War",
        }
        townes_van_zandt = {
            'artist_msid': 'f3e64219-ac00-4b6b-ad15-6e4801cb30a0',
            'artist_name': 'Townes Van Zandt',
            'listened_at': '2019-01-12 00:00:00',
            'recording_msid': '00000465-fcc1-41ab-a735-553f6ce677c4',
            'release_msid': '',
            'release_name':
            'Sunshine Boy: The Unheard Studio Sessions & Demos 1971 - 1972',
            'track_name': 'Dead Flowers',
        }
        expected_missing_mb_data = [{
            'type': 'missing_musicbrainz_data',
            'musicbrainz_id': 'vansika',
            'missing_musicbrainz_data': [less_than_jake, townes_van_zandt],
            'source': 'cf',
        }]

        self.assertEqual(expected_missing_mb_data, messages)
Ejemplo n.º 6
0
    def test_get_mapped_artist_and_recording_mbids(self):
        """Check row count, columns and stored output of the mapped listens.

        The mapped dataframe must contain one row, have the same column set
        as ``self.get_mapped_listens()`` (compared after sorting), and
        ``path.MAPPED_LISTENS`` must exist afterwards.
        """
        listens = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, self.listens_path)
        mapping = utils.read_files_from_HDFS(self.mapping_path)
        mapped = create_dataframes.get_mapped_artist_and_recording_mbids(
            listens, mapping)

        self.assertEqual(mapped.count(), 1)

        # Column order is irrelevant here, so compare sorted names.
        expected_columns = sorted(self.get_mapped_listens().columns)
        actual_columns = sorted(mapped.columns)
        self.assertListEqual(expected_columns, actual_columns)

        self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
    def test_get_mapped_artist_and_recording_mbids(self):
        """Check row count, exact column order and stored output.

        The mapped dataframe must contain one row, its columns must match the
        expected list element by element (order matters), and
        ``path.MAPPED_LISTENS`` must exist afterwards.
        """
        # Listen columns first, then the mb_* and msb_* mapping columns.
        listen_columns = [
            'artist_msid', 'artist_name', 'listened_at', 'recording_msid',
            'release_mbid', 'release_msid', 'release_name', 'tags',
            'track_name', 'user_name',
        ]
        mapping_columns = [
            'mb_artist_credit_id', 'mb_artist_credit_mbids',
            'mb_recording_mbid', 'mb_release_mbid',
            'msb_artist_msid', 'msb_recording_msid', 'msb_release_msid',
        ]
        expected_columns = listen_columns + mapping_columns

        listens = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, LISTENS_PATH)
        mapping = utils.read_files_from_HDFS(MAPPING_PATH)
        mapped = create_dataframes.get_mapped_artist_and_recording_mbids(
            listens, mapping)

        self.assertEqual(mapped.count(), 1)
        self.assertListEqual(expected_columns, mapped.columns)
        self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
    def test_get_mapped_artist_and_recording_mbids(self):
        """Check row count, column set and stored output of the mapped listens.

        The mapped dataframe must contain eight rows, expose exactly the
        expected column names (compared after sorting), and
        ``path.MAPPED_LISTENS`` must exist afterwards.
        """
        expected_columns = sorted([
            'listened_at', 'mb_artist_credit_id', 'mb_artist_credit_mbids',
            'mb_recording_mbid', 'mb_release_mbid',
            'msb_artist_credit_name_matchable', 'msb_recording_name_matchable',
            'user_name',
        ])

        listens = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, self.listens_path)
        mapping = utils.read_files_from_HDFS(self.mapping_path)
        mapped = create_dataframes.get_mapped_artist_and_recording_mbids(
            listens, mapping)

        self.assertEqual(mapped.count(), 8)
        # Column order is irrelevant here, so compare sorted names.
        self.assertListEqual(expected_columns, sorted(mapped.columns))
        self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
    def test_get_data_missing_from_musicbrainz(self):
        """Check the rows reported as missing from MusicBrainz.

        Each row yielded by
        ``create_dataframes.get_data_missing_from_musicbrainz`` is flattened
        into a dict (with ``user_name`` fixed to ``'vansika'``) and the
        resulting list is compared, in order, with the two expected listens.
        """
        listens = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, self.listens_path)
        mapping = utils.read_files_from_HDFS(self.mapping_path)
        missing_rows = create_dataframes.get_data_missing_from_musicbrainz(
            listens, mapping)

        # listened_at is stringified so it can be compared with a literal.
        received_data = [
            {
                'user_name': 'vansika',
                'artist_msid': row.artist_msid,
                'artist_name': row.artist_name,
                'listened_at': str(row.listened_at),
                'recording_msid': row.recording_msid,
                'release_msid': row.release_msid,
                'release_name': row.release_name,
                'track_name': row.track_name,
            }
            for row in missing_rows
        ]

        less_than_jake = {
            'user_name': 'vansika',
            'artist_msid': 'a36d6fc9-49d0-4789-a7dd-a2b72369ca45',
            'artist_name': 'Less Than Jake',
            'listened_at': '2019-01-13 00:00:00',
            'recording_msid': 'cb6985cd-cc71-4d59-b4fb-2e72796af741',
            'release_msid': '',
            'release_name': 'lala',
            'track_name': "Al's War",
        }
        townes_van_zandt = {
            'user_name': 'vansika',
            'artist_msid': 'f3e64219-ac00-4b6b-ad15-6e4801cb30a0',
            'artist_name': 'Townes Van Zandt',
            'listened_at': '2019-01-12 00:00:00',
            'recording_msid': '00000465-fcc1-41ab-a735-553f6ce677c4',
            'release_msid': '',
            'release_name':
            'Sunshine Boy: The Unheard Studio Sessions & Demos 1971 - 1972',
            'track_name': 'Dead Flowers',
        }
        expected_data = [less_than_jake, townes_van_zandt]

        self.assertEqual(received_data, expected_data)