def process_json_listens(self, filename, data_dir, tmp_hdfs_path, schema):
    """ Process a file containing listens from the ListenBrainz dump and add listens to
        appropriate dataframes.

        Args:
            filename (str): File name of the JSON file.
            data_dir (str): Dir to save listens to in HDFS as parquet.
            tmp_hdfs_path (str): HDFS path where the listens JSON has been uploaded.
            schema: Schema of the listens.
    """
    start_time = time.time()
    df = utils.read_json(tmp_hdfs_path, schema=schema)
    current_app.logger.info("Processing {} listens...".format(df.count()))

    if filename.split('/')[-1] == 'invalid.json':
        dest_path = os.path.join(data_dir, 'invalid.parquet')
    else:
        year = filename.split('/')[-2]
        month = filename.split('/')[-1][0:-5]
        dest_path = os.path.join(data_dir, year, '{}.parquet'.format(str(month)))

    current_app.logger.info("Uploading to {}...".format(dest_path))
    utils.save_parquet(df, dest_path)
    current_app.logger.info("File processed in {:.2f} seconds!".format(time.time() - start_time))

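# `utils.read_json` and `utils.save_parquet` are not shown in this section. A minimal
# sketch of what such wrappers could look like, assuming a module-level SparkSession
# named `session`; the stand-in `FileNotSavedException` class and the exact constructor
# arguments are assumptions, not the project's confirmed implementation:

from py4j.protocol import Py4JJavaError
from pyspark.sql import SparkSession

# Hypothetical session setup; the real project presumably initializes this elsewhere.
session = SparkSession.builder.appName("listenbrainz-sketch").getOrCreate()


class FileNotSavedException(Exception):
    """ Stand-in for the project's exception of the same name. """


def read_json(hdfs_path, schema=None):
    # Load a JSON file from HDFS into a dataframe, optionally enforcing a schema.
    return session.read.json(hdfs_path, schema=schema)


def save_parquet(df, path, mode='overwrite'):
    # Write the dataframe to HDFS as parquet; surface JVM-side write errors as
    # FileNotSavedException so callers (e.g. save_candidate_sets below) can log and re-raise.
    try:
        df.write.format('parquet').save(path, mode=mode)
    except Py4JJavaError as err:
        raise FileNotSavedException('{}. Parquet could not be saved to {}.'.format(str(err.java_exception), path))
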
def upload_test_mapped_listens_to_hdfs(cls, listens_path, mapping_path, mapped_listens_path):
    partial_listen_df = dataframe_utils.get_listens_for_training_model_window(
        cls.date, cls.date, listens_path)
    df = utils.read_files_from_HDFS(mapping_path)
    mapping_df = mapping_utils.get_unique_rows_from_mapping(df)

    mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(
        partial_listen_df, mapping_df)
    utils.save_parquet(mapped_listens, mapped_listens_path)

def test_delete_model(self):
    df = utils.create_dataframe(Row(col1=1, col2=1), None)
    utils.save_parquet(df, path.RECOMMENDATION_RECORDING_DATA_DIR)
    train_models.delete_model()

    dir_exists = utils.path_exists(path.RECOMMENDATION_RECORDING_DATA_DIR)
    self.assertFalse(dir_exists)

def save_candidate_sets(top_artist_candidate_set_df, similar_artist_candidate_set_df):
    """ Save candidate sets to HDFS.

        Args:
            top_artist_candidate_set_df (dataframe): recording ids that belong to top artists
                corresponding to user ids.
            similar_artist_candidate_set_df (dataframe): recording ids that belong to similar
                artists corresponding to user ids.
    """
    try:
        utils.save_parquet(top_artist_candidate_set_df,
                           path.RECOMMENDATION_RECORDING_TOP_ARTIST_CANDIDATE_SET)
    except FileNotSavedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        utils.save_parquet(similar_artist_candidate_set_df,
                           path.RECOMMENDATION_RECORDING_SIMILAR_ARTIST_CANDIDATE_SET)
    except FileNotSavedException as err:
        logger.error(str(err), exc_info=True)
        raise

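# The two try/except blocks above are identical apart from the dataframe and the
# destination path. A small helper could remove the duplication; `_save_or_log` is a
# hypothetical name introduced here for illustration, not part of the project:


def _save_or_log(df, dest_path):
    # Save one candidate set, logging the failure with a traceback before
    # re-raising so the caller can abort the run.
    try:
        utils.save_parquet(df, dest_path)
    except FileNotSavedException as err:
        logger.error(str(err), exc_info=True)
        raise
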
def save_dataframe(self):
    now = datetime.now()

    with open(self.path_to_data_file('user_top_artists.json')) as f:
        data = json.load(f)

    schema = StructType((
        StructField('user_name', StringType()),
        StructField('artist_name', StringType()),
        StructField('artist_msid', StringType()),
        StructField('artist_mbids', ArrayType(StringType())),
    ))

    df = None
    for entry in data:
        for idx in range(0, entry['count']):
            # Create one row per listen so that each artist appears `count` times
            row = utils.create_dataframe(Row(
                user_name=entry['user_name'],
                artist_name=entry['artist_name'],
                artist_msid=entry['artist_msid'],
                artist_mbids=entry['artist_mbids'],
            ), schema=schema)
            df = df.union(row) if df else row

    utils.save_parquet(df, os.path.join(self.path_, '{}/{}.parquet'.format(now.year, now.month)))

def process_json_listens(self, filename, data_dir, tmp_hdfs_path, append, schema):
    """ Process a file containing listens from the ListenBrainz dump and add listens to
        appropriate dataframes.

        Args:
            filename (str): File name of the JSON file.
            data_dir (str): Dir to save listens to in HDFS as parquet.
            tmp_hdfs_path (str): HDFS path where the listens JSON has been uploaded.
            append (bool): If true, append to the existing parquet rather than overwrite it.
            schema: Schema of the listens.
    """
    start_time = time.monotonic()
    df = utils.read_json(tmp_hdfs_path, schema=schema)

    if filename.split('/')[-1] == 'invalid.json':
        dest_path = os.path.join(data_dir, 'invalid.parquet')
    else:
        year = filename.split('/')[-2]
        month = filename.split('/')[-1][0:-5]
        dest_path = os.path.join(data_dir, year, '{}.parquet'.format(str(month)))

    logger.info("Uploading to {}...".format(dest_path))
    if append and utils.path_exists(dest_path):
        utils.save_parquet(df, dest_path, mode="append")
    else:
        utils.save_parquet(df, dest_path, mode="overwrite")

    logger.info("File processed in {:.2f} seconds!".format(time.monotonic() - start_time))

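# The mode argument maps directly onto PySpark's DataFrameWriter save modes: "append"
# adds the incoming rows alongside the existing parquet part-files, while "overwrite"
# replaces the directory's contents. A minimal sketch, assuming a SparkSession named
# `session` and a writable scratch path (both assumptions):


def append_vs_overwrite_demo(session, dest_path='/tmp/demo.parquet'):
    df = session.createDataFrame([(1,)], ['col'])
    df.write.format('parquet').save(dest_path, mode='overwrite')   # dest now holds 1 row
    df.write.format('parquet').save(dest_path, mode='append')      # dest now holds 2 rows
    assert session.read.parquet(dest_path).count() == 2
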
def upload_test_mapped_listens_to_HDFS(cls):
    partial_listen_df = create_dataframes.get_listens_for_training_model_window(
        cls.date, cls.date, {}, LISTENS_PATH)
    mapping_df = utils.read_files_from_HDFS(MAPPING_PATH)

    mapped_df = create_dataframes.get_mapped_artist_and_recording_mbids(
        partial_listen_df, mapping_df)
    utils.save_parquet(mapped_df, MAPPED_LISTENS_PATH)

def test_create_dataframe(self):
    hdfs_path = self.path_ + '/test_df.parquet'
    df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
    self.assertEqual(df.count(), 1)

    utils.save_parquet(df, hdfs_path)
    received_df = utils.read_files_from_HDFS(hdfs_path)
    self.assertEqual(received_df.count(), 1)

def upload_test_mapped_listens_to_HDFS(cls):
    partial_listen_df = create_dataframes.get_listens_for_training_model_window(
        cls.date, cls.date, {}, cls.listens_path)
    mapping_df = utils.read_files_from_HDFS(cls.mapping_path)

    mapped_listens = create_dataframes.get_mapped_artist_and_recording_mbids(
        partial_listen_df, mapping_df)
    utils.save_parquet(mapped_listens, cls.mapped_listens_path)

def upload_test_mapping_listens_subset_to_hdfs(cls):
    mapped_df = utils.read_files_from_HDFS(cls.mapped_listens_path)
    from_date = stats.offset_days(cls.date, 4)
    to_date = cls.date
    mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
        mapped_df, from_date, to_date)
    utils.save_parquet(mapped_listens_subset, cls.mapped_listens_subset_path)

def test_save_parquet(self):
    path_ = 'test_df.parquet'
    hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)
    df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
    utils.save_parquet(df, hdfs_path)

    received_df = utils.read_files_from_HDFS(hdfs_path)
    self.assertEqual(received_df.count(), 1)

def upload_test_mapping_to_hdfs(cls, mapping_path):
    with open(cls.path_to_data_file('msid_mbid_mapping.json')) as f:
        data = json.load(f)

    mapping_df = None
    for row in data:
        df = utils.create_dataframe(schema.convert_mapping_to_row(row),
                                    schema=schema.msid_mbid_mapping_schema)
        mapping_df = mapping_df.union(df) if mapping_df else df

    utils.save_parquet(mapping_df, mapping_path)

def test_get_latest_listen_ts(self):
    date = datetime(2020, 5, 18)
    df = utils.create_dataframe(Row(listened_at=date), schema=None)
    df = df.union(utils.create_dataframe(Row(listened_at=offset_days(date, 7)), schema=None))
    utils.save_parquet(df, '{}/2020/5.parquet'.format(self.path_))

    result = stats_utils.get_latest_listen_ts()
    self.assertEqual(date, result)

def test_get_latest_full_dump_no_full(self):
    """ Test that None is returned if no full import has been made. """
    # Remove full dump entries from the parquet
    import_meta_df = read_files_from_HDFS(self.path_)
    result = import_meta_df.filter(import_meta_df.dump_type != "full")

    # The dataframe is lazily read from the file it came from, so save it to a
    # different file first and then move it into place.
    save_parquet(result, '/temp.parquet')
    delete_dir(self.path_, recursive=True)
    rename('/temp.parquet', self.path_)

    self.assertIsNone(import_utils.get_latest_full_dump())

def save_dataframe(self):
    df = utils.create_dataframe(Row(user_name='user2', artist_name='artist1', artist_msid='1',
                                    artist_mbids='1', track_name='test', recording_msid='1',
                                    recording_mbid='1', release_name='test', release_msid='1',
                                    release_mbid='1'), schema=None)
    df1 = utils.create_dataframe(Row(user_name='user1', artist_name='artist1', artist_msid='1',
                                     artist_mbids='1', track_name='test', recording_msid='1',
                                     recording_mbid='1', release_name='test', release_msid='1',
                                     release_mbid='1'), schema=None)
    df2 = utils.create_dataframe(Row(user_name='user1', artist_name='artist1', artist_msid='1',
                                     artist_mbids='1', track_name='test', recording_msid='1',
                                     recording_mbid='1', release_name='test', release_msid='1',
                                     release_mbid='1'), schema=None)

    df = df.union(df1).union(df2)
    utils.save_parquet(df, '/data/listenbrainz/2019/12.parquet')

def upload_test_listen_to_hdfs(cls, listens_path):
    with open(cls.path_to_data_file('listens.json')) as f:
        data = json.load(f)

    listens_df = None
    for row in data:
        row['listened_at'] = datetime.strptime(row['listened_at'], '%d-%m-%Y')
        df = utils.create_dataframe(schema.convert_to_spark_json(row), schema=schema.listen_schema)
        listens_df = listens_df.union(df) if listens_df else df

    utils.save_parquet(listens_df,
                       listens_path + '/{}/{}.parquet'.format(cls.date.year, cls.date.month))

def save_dataframe(df, dest_path):
    """ Save dataframe to HDFS.

        Args:
            df: Dataframe to save.
            dest_path (str): HDFS path to save the dataframe to.
    """
    try:
        save_parquet(df, dest_path)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

def upload_test_playcounts(cls):
    schema = StructType([
        StructField("user_id", IntegerType()),
        StructField("recording_id", IntegerType()),
        StructField("count", IntegerType())
    ])
    test_playcounts = []
    # First half of the rows belong to user 1 / recording 1, second half to user 2 / recording 2
    for i in range(1, PLAYCOUNTS_COUNT // 2 + 1):
        test_playcounts.append([1, 1, 1])
    for i in range(PLAYCOUNTS_COUNT // 2 + 1, PLAYCOUNTS_COUNT + 1):
        test_playcounts.append([2, 2, 1])
    test_playcounts_df = listenbrainz_spark.session.createDataFrame(test_playcounts, schema=schema)
    utils.save_parquet(test_playcounts_df, TEST_PLAYCOUNTS_PATH)

def process_json(self, _, dest_path, tmp_hdfs_path, __, schema):
    """ Read JSON from HDFS as a dataframe and upload it to HDFS as parquet.

        Args:
            dest_path (str): HDFS path to upload the JSON as parquet.
            tmp_hdfs_path (str): HDFS path where the JSON has been uploaded.
            schema: Schema of the data.
    """
    start_time = time.monotonic()
    df = utils.read_json(tmp_hdfs_path, schema=schema)
    logger.info("Processing {} rows...".format(df.count()))

    logger.info("Uploading to {}...".format(dest_path))
    utils.save_parquet(df, dest_path)
    logger.info("File processed in {:.2f} seconds!".format(time.monotonic() - start_time))

def upload_test_mapping_to_HDFS(cls): test_mapping = { "msb_recording_msid": "cb6985cd-cc71-4d59-b4fb-2e72796af741", "mb_recording_mbid": "3acb406f-c716-45f8-a8bd-96ca3939c2e5", "msb_artist_msid": "a36d6fc9-49d0-4789-a7dd-a2b72369ca45", "mb_artist_credit_mbids": ["181c4177-f33a-441d-b15d-910acaf18b07"], "mb_artist_credit_id": 2157963, "mb_release_mbid": "xxxxx", "msb_release_msid": "xxxxx" } test_mapping_df = utils.create_dataframe( schema.convert_mapping_to_row(test_mapping), schema.msid_mbid_mapping_schema) utils.save_parquet(test_mapping_df, MAPPING_PATH)
def setUp(self):
    """ Store the test data as parquet in HDFS before each test. """
    with open(self.path_to_data_file("import_metadata.json")) as f:
        data = json.load(f)

    df = None
    for entry in data:
        row = create_dataframe(Row(dump_id=entry["dump_id"],
                                   dump_type=entry["dump_type"],
                                   imported_at=datetime.fromtimestamp(entry["imported_at"])),
                               schema=import_metadata_schema)
        df = df.union(row) if df else row

    save_parquet(df, self.path_)
    return super().setUp()

def test_get_most_recent_model_id(self):
    model_id_1 = "a36d6fc9-49d0-4789-a7dd-a2b72369ca45"
    model_metadata_dict_1 = self.get_model_metadata(model_id_1)
    df_1 = utils.create_dataframe(schema.convert_model_metadata_to_row(model_metadata_dict_1),
                                  schema.model_metadata_schema)

    model_id_2 = "bbbd6fc9-49d0-4789-a7dd-a2b72369ca45"
    model_metadata_dict_2 = self.get_model_metadata(model_id_2)
    df_2 = utils.create_dataframe(schema.convert_model_metadata_to_row(model_metadata_dict_2),
                                  schema.model_metadata_schema)

    model_metadata = df_1.union(df_2)
    utils.save_parquet(model_metadata, path.RECOMMENDATION_RECORDING_MODEL_METADATA)

    received_model_id = recommend.get_most_recent_model_id()
    self.assertEqual(received_model_id, model_id_2)

def test_get_listens(self):
    from_date = datetime(2019, 10, 1)
    to_date = datetime(2019, 11, 1)
    path_ = 'test_df'
    hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)

    df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
    dest_path = hdfs_path + '/{}/{}.parquet'.format(from_date.year, from_date.month)
    utils.save_parquet(df, dest_path)

    df = utils.create_dataframe(Row(column1=3, column2=4), schema=None)
    dest_path = hdfs_path + '/{}/{}.parquet'.format(to_date.year, to_date.month)
    utils.save_parquet(df, dest_path)

    received_df = utils.get_listens(from_date, to_date, hdfs_path)
    self.assertEqual(received_df.count(), 2)

def test_get_listens(self):
    from_date = datetime(2019, 10, 1)
    to_date = datetime(2019, 11, 1)

    df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
    dest_path = self.path_ + '/{}/{}.parquet'.format(from_date.year, from_date.month)
    utils.save_parquet(df, dest_path)

    df = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
    dest_path = self.path_ + '/{}/{}.parquet'.format(to_date.year, to_date.month)
    utils.save_parquet(df, dest_path)

    received_df = utils.get_listens(from_date, to_date, self.path_)
    self.assertEqual(received_df.count(), 2)

def upload_test_listen_to_HDFS(cls):
    month, year = cls.date.strftime('%m').lstrip('0'), cls.date.strftime('%Y')

    test_listen = {
        "user_name": "vansika",
        "artist_msid": "a36d6fc9-49d0-4789-a7dd-a2b72369ca45",
        "artist_name": "Less Than Jake",
        "artist_mbids": [],
        "release_mbid": "",
        "track_name": "Al's War",
        "recording_msid": "cb6985cd-cc71-4d59-b4fb-2e72796af741",
        "tags": [],
        "listened_at": cls.date
    }

    test_listens_df = utils.create_dataframe(schema.convert_to_spark_json(test_listen),
                                             schema.listen_schema)
    utils.save_parquet(test_listens_df, LISTENS_PATH + '{}/{}.parquet'.format(year, month))

def insert_dump_data(dump_id: int, dump_type: str, imported_at: datetime):
    """ Insert information about an imported dump. """
    import_meta_df = None
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        current_app.logger.info("Import metadata file not found, creating...")

    data = create_dataframe(Row(dump_id, dump_type, imported_at), schema=import_metadata_schema)
    if import_meta_df:
        # Replace any existing entry for this dump before adding the new one
        result = import_meta_df \
            .filter(f"dump_id != '{dump_id}' OR dump_type != '{dump_type}'") \
            .union(data)
    else:
        result = data

    # The dataframe is lazily read from the file it came from, so save it to a
    # different file first and then move it into place.
    save_parquet(result, "/temp.parquet")
    if path_exists(IMPORT_METADATA):
        delete_dir(IMPORT_METADATA, recursive=True)
    rename("/temp.parquet", IMPORT_METADATA)

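# Spark reads parquet lazily, so overwriting IMPORT_METADATA in place while `result`
# still references it would fail partway through. The write-to-temp-then-rename steps
# above (and in test_get_latest_full_dump_no_full) are the workaround; a reusable
# version might look like this, where `overwrite_parquet` is a hypothetical helper
# built on the same save_parquet/path_exists/delete_dir/rename utilities:


def overwrite_parquet(df, dest_path, temp_path="/temp.parquet"):
    # Materialize the dataframe somewhere else first...
    save_parquet(df, temp_path)
    # ...then swap it into place once the source files are no longer needed.
    if path_exists(dest_path):
        delete_dir(dest_path, recursive=True)
    rename(temp_path, dest_path)
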
def upload_test_mapping_to_HDFS(cls): test_mapping = { "msb_recording_msid": "cb6985cd-cc71-4d59-b4fb-2e72796af741", "mb_recording_mbid": "3acb406f-c716-45f8-a8bd-96ca3939c2e5", "msb_artist_msid": "a36d6fc9-49d0-4789-a7dd-a2b72369ca45", "mb_artist_credit_mbids": ["181c4177-f33a-441d-b15d-910acaf18b07"], "mb_artist_credit_id": 2157963, "mb_release_mbid": "xxxxx", "msb_release_msid": "xxxxx", "mb_artist_credit_name": "Less Than Jake", "msb_artist_credit_name_matchable": "lessthanjake", "mb_recording_name": "Al's War", "msb_recording_name_matchable": "alswar", "mb_release_name": "Easier", "msb_release_name_matchable": "easier", } test_mapping_df = utils.create_dataframe( schema.convert_mapping_to_row(test_mapping), schema.msid_mbid_mapping_schema) utils.save_parquet(test_mapping_df, cls.mapping_path)
def test_copy(self):
    # Test directories
    utils.create_dir(self.path_)
    utils.create_dir(os.path.join(self.path_, "a"))
    utils.create_dir(os.path.join(self.path_, "b"))

    # DataFrames to create parquets
    df_a = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
    df_b = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
    df_c = utils.create_dataframe([Row(column1=5, column2=6)], schema=None)

    # Save DataFrames in their respective directories
    utils.save_parquet(df_a, os.path.join(self.path_, "a", "df_a.parquet"))
    utils.save_parquet(df_b, os.path.join(self.path_, "b", "df_b.parquet"))
    utils.save_parquet(df_c, os.path.join(self.path_, "df_c.parquet"))

    utils.copy(self.path_, self.temp_path_, overwrite=True)

    # Read the copied DataFrames
    cp_df_a = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "a", "df_a.parquet"))
    cp_df_b = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "b", "df_b.parquet"))
    cp_df_c = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "df_c.parquet"))

    # Check that each copied DataFrame matches its original
    self.assertListEqual(df_a.rdd.map(list).collect(), cp_df_a.rdd.map(list).collect())
    self.assertListEqual(df_b.rdd.map(list).collect(), cp_df_b.rdd.map(list).collect())
    self.assertListEqual(df_c.rdd.map(list).collect(), cp_df_c.rdd.map(list).collect())

def save_candidate_sets(top_artists_candidate_set_df, similar_artists_candidate_set_df):
    """ Save candidate sets to HDFS.

        Args:
            top_artists_candidate_set_df (dataframe): Dataframe consisting of recording ids of
                top artists listened to by a user, for all users for whom recommendations shall
                be generated. Columns can be depicted as:
                    ['user_id', 'recording_id']
            similar_artists_candidate_set_df (dataframe): Dataframe consisting of recording ids
                of artists similar to the top artists listened to by a user, for all users for
                whom recommendations shall be generated. Columns can be depicted as:
                    ['user_id', 'recording_id']
    """
    utils.save_parquet(top_artists_candidate_set_df, path.TOP_ARTIST_CANDIDATE_SET)
    utils.save_parquet(similar_artists_candidate_set_df, path.SIMILAR_ARTIST_CANDIDATE_SET)

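# For reference, a candidate set with the ['user_id', 'recording_id'] shape described
# in the docstring above could be built like this (a sketch assuming a SparkSession
# named `session`; the row values are made up for illustration):

candidate_rows = [(1, 101), (1, 102), (2, 201)]
top_artists_candidate_set_df = session.createDataFrame(candidate_rows, ['user_id', 'recording_id'])
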
def test_process_json_listens_append(self, mock_read_json):
    fakeschema = StructType([StructField('column_1', StringType()),
                             StructField('column_2', StringType())])

    # Save the old dataframe in HDFS
    old_df = utils.create_dataframe(Row(column_1='row_a', column_2='row_a'), fakeschema)
    old_df = old_df.union(utils.create_dataframe(Row(column_1='row_b', column_2='row_b'), fakeschema))
    utils.save_parquet(old_df, os.path.join(self.path_, '2020', '1.parquet'))

    # Mock read_json to return the new dataframe
    new_df = utils.create_dataframe(Row(column_1='row_c', column_2='row_c'), fakeschema)
    mock_read_json.return_value = new_df

    ListenbrainzDataUploader().process_json_listens('/2020/1.json', self.path_, self.path_,
                                                    append=True, schema=fakeschema)

    received = utils.read_files_from_HDFS(os.path.join(self.path_, '2020', '1.parquet')) \
        .rdd \
        .map(list) \
        .collect()

    # Appending should leave both the old and the new rows in the parquet
    expected = old_df.union(new_df).rdd.map(list).collect()
    self.assertCountEqual(received, expected)