def upload_test_mapped_listens_to_HDFS(cls):
    """Create the mapped-listens dataframe and save it as parquet.

    Listens are fetched for the window ``cls.date`` .. ``cls.date`` from
    ``cls.listens_path``, the mapping is read from ``cls.mapping_path``,
    both are passed to ``get_mapped_artist_and_recording_mbids`` and the
    result is written to ``cls.mapped_listens_path``.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        cls.date, cls.date, {}, cls.listens_path
    )
    mapping = utils.read_files_from_HDFS(cls.mapping_path)
    utils.save_parquet(
        create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping),
        cls.mapped_listens_path,
    )
def test_get_listens_for_training_model_window(self):
    """Check the metadata filled in and the columns absent from the result.

    The same date (``self.date``) is used for both ends of the window, so
    both metadata entries are expected to equal it.
    """
    window_metadata = {}
    listens = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, window_metadata, self.listens_path
    )

    # The call is expected to record the window bounds in the dict we passed.
    for key in ('to_date', 'from_date'):
        self.assertEqual(window_metadata[key], self.date)

    # These columns must not be present in the returned dataframe.
    for column in ('artist_mbids', 'recording_mbid'):
        self.assertNotIn(column, listens.columns)
def upload_test_mapped_listens_to_HDFS(cls):
    """Create the mapped-listens dataframe and save it as parquet.

    Listens are fetched for the window ``cls.date`` .. ``cls.date`` from
    ``LISTENS_PATH``, the mapping is read from ``MAPPING_PATH``, both are
    passed to ``get_mapped_artist_and_recording_mbids`` and the result is
    written to ``MAPPED_LISTENS_PATH``.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        cls.date, cls.date, {}, LISTENS_PATH
    )
    mapping = utils.read_files_from_HDFS(MAPPING_PATH)
    utils.save_parquet(
        create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping),
        MAPPED_LISTENS_PATH,
    )
def test_get_listens_for_training_model_window(self):
    """Check the metadata filled in and the columns absent from the result.

    ``to_date`` comes from ``get_latest_listen_ts()`` and ``from_date`` is
    derived from it with ``stats.offset_days(to_date, 2)``.
    """
    window_metadata = {}
    to_date = get_latest_listen_ts()
    from_date = stats.offset_days(to_date, 2)
    listens = create_dataframes.get_listens_for_training_model_window(
        to_date, from_date, window_metadata, self.listens_path
    )

    # The call is expected to record the window bounds in the dict we passed.
    for key, expected in (('to_date', to_date), ('from_date', from_date)):
        self.assertEqual(window_metadata[key], expected)

    # These columns must not be present in the returned dataframe.
    for column in ('artist_mbids', 'recording_mbid'):
        self.assertNotIn(column, listens.columns)
def test_prepare_messages(self):
    """Check the messages produced by ``prepare_messages``.

    The first message is checked field by field (type, formatted dates and
    the two timing strings); everything after it must equal the expected
    list of missing-MusicBrainz-data messages.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, {}, self.listens_path
    )
    mapping = utils.read_files_from_HDFS(self.mapping_path)

    window_start = datetime(2019, 6, 21)
    window_end = datetime(2019, 8, 21)
    started_at = time.monotonic()

    missing_rows = create_dataframes.get_data_missing_from_musicbrainz(listens, mapping)
    messages = create_dataframes.prepare_messages(
        missing_rows, window_start, window_end, started_at
    )

    # The first message is removed from the list and checked on its own.
    summary = messages.pop(0)
    self.assertEqual(summary['type'], 'cf_recording_dataframes')
    self.assertEqual(summary['from_date'], str(window_start.strftime('%b %Y')))
    self.assertEqual(summary['to_date'], str(window_end.strftime('%b %Y')))
    for timing_key in ('dataframe_upload_time', 'total_time'):
        self.assertIsInstance(summary[timing_key], str)

    less_than_jake_listen = {
        'artist_msid': 'a36d6fc9-49d0-4789-a7dd-a2b72369ca45',
        'artist_name': 'Less Than Jake',
        'listened_at': '2019-01-13 00:00:00',
        'recording_msid': 'cb6985cd-cc71-4d59-b4fb-2e72796af741',
        'release_msid': '',
        'release_name': 'lala',
        'track_name': "Al's War",
    }
    townes_van_zandt_listen = {
        'artist_msid': 'f3e64219-ac00-4b6b-ad15-6e4801cb30a0',
        'artist_name': 'Townes Van Zandt',
        'listened_at': '2019-01-12 00:00:00',
        'recording_msid': '00000465-fcc1-41ab-a735-553f6ce677c4',
        'release_msid': '',
        'release_name': 'Sunshine Boy: The Unheard Studio Sessions & Demos 1971 - 1972',
        'track_name': 'Dead Flowers',
    }
    expected_remaining = [{
        'type': 'missing_musicbrainz_data',
        'musicbrainz_id': 'vansika',
        'missing_musicbrainz_data': [less_than_jake_listen, townes_van_zandt_listen],
        'source': 'cf',
    }]

    # ``messages`` no longer contains the summary popped above.
    self.assertEqual(expected_remaining, messages)
def test_get_mapped_artist_and_recording_mbids(self):
    """Check row count and columns of the mapped listens, and the HDFS path.

    Columns are compared (order-insensitively) against those of the
    dataframe returned by ``self.get_mapped_listens()``.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, {}, self.listens_path
    )
    mapping = utils.read_files_from_HDFS(self.mapping_path)
    mapped = create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping)

    self.assertEqual(mapped.count(), 1)

    expected_columns = sorted(self.get_mapped_listens().columns)
    self.assertListEqual(expected_columns, sorted(mapped.columns))

    self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
def test_get_mapped_artist_and_recording_mbids(self):
    """Check row count and exact column list of the mapped listens.

    The column comparison is order-sensitive: the dataframe's columns must
    match the expected list element by element. Also checks that
    ``path.MAPPED_LISTENS`` exists.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, {}, LISTENS_PATH
    )
    mapping = utils.read_files_from_HDFS(MAPPING_PATH)
    mapped = create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping)

    self.assertEqual(mapped.count(), 1)

    # Expected columns, grouped by prefix; concatenation keeps the order.
    unprefixed_cols = [
        'artist_msid',
        'artist_name',
        'listened_at',
        'recording_msid',
        'release_mbid',
        'release_msid',
        'release_name',
        'tags',
        'track_name',
        'user_name',
    ]
    mb_cols = [
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
    ]
    msb_cols = [
        'msb_artist_msid',
        'msb_recording_msid',
        'msb_release_msid',
    ]
    expected_columns = unprefixed_cols + mb_cols + msb_cols
    self.assertListEqual(expected_columns, mapped.columns)

    self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
def test_get_mapped_artist_and_recording_mbids(self):
    """Check row count and columns of the mapped listens, and the HDFS path.

    Eight rows are expected; columns are compared order-insensitively by
    sorting both sides before the comparison.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, {}, self.listens_path
    )
    mapping = utils.read_files_from_HDFS(self.mapping_path)
    mapped = create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping)

    self.assertEqual(mapped.count(), 8)

    expected_columns = [
        'listened_at',
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
        'user_name',
    ]
    self.assertListEqual(sorted(expected_columns), sorted(mapped.columns))

    self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
def test_get_data_missing_from_musicbrainz(self):
    """Check the rows yielded by ``get_data_missing_from_musicbrainz``.

    Each yielded row is converted to a plain dict (with ``listened_at``
    stringified and ``user_name`` fixed to ``'vansika'``) and the resulting
    list is compared against the expected listens.
    """
    listens = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, {}, self.listens_path
    )
    mapping = utils.read_files_from_HDFS(self.mapping_path)
    missing_rows = create_dataframes.get_data_missing_from_musicbrainz(listens, mapping)

    received = [
        {
            'user_name': 'vansika',
            'artist_msid': row.artist_msid,
            'artist_name': row.artist_name,
            'listened_at': str(row.listened_at),
            'recording_msid': row.recording_msid,
            'release_msid': row.release_msid,
            'release_name': row.release_name,
            'track_name': row.track_name,
        }
        for row in missing_rows
    ]

    less_than_jake_listen = {
        'user_name': 'vansika',
        'artist_msid': 'a36d6fc9-49d0-4789-a7dd-a2b72369ca45',
        'artist_name': 'Less Than Jake',
        'listened_at': '2019-01-13 00:00:00',
        'recording_msid': 'cb6985cd-cc71-4d59-b4fb-2e72796af741',
        'release_msid': '',
        'release_name': 'lala',
        'track_name': "Al's War",
    }
    townes_van_zandt_listen = {
        'user_name': 'vansika',
        'artist_msid': 'f3e64219-ac00-4b6b-ad15-6e4801cb30a0',
        'artist_name': 'Townes Van Zandt',
        'listened_at': '2019-01-12 00:00:00',
        'recording_msid': '00000465-fcc1-41ab-a735-553f6ce677c4',
        'release_msid': '',
        'release_name': 'Sunshine Boy: The Unheard Studio Sessions & Demos 1971 - 1972',
        'track_name': 'Dead Flowers',
    }
    expected = [less_than_jake_listen, townes_van_zandt_listen]

    self.assertEqual(received, expected)