def test_get_entity_all_time(self, mock_create_message, mock_get_listens, mock_get_latest_listen_ts):
    """All-time sitewide entity stats should create one time range per
    calendar year from LAST_FM_FOUNDING_YEAR through the latest listen year
    and pass the expected arguments to every helper.
    """
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df

    entity_stats.get_entity_all_time('test', False)

    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)
    to_date = datetime(2020, 8, 21)
    # Fix: dropped the unused leftover local `month = from_date` (copy-paste
    # from the month-based tests; all-time ranges are per year, not month).
    # Expected rows: [year label, start-of-year ts, end-of-year ts].
    time_range = [[
        str(year),
        int(datetime(year, 1, 1).timestamp()),
        int(get_year_end(year).timestamp())
    ] for year in range(from_date.year, to_date.year + 1)]

    time_range_df = run_query("SELECT * FROM time_range")
    time_range_result = time_range_df.rdd.map(list).collect()
    self.assertListEqual(time_range_result, time_range)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_df.createOrReplaceTempView.assert_called_with('sitewide_test_all_time')
    mock_create_message.assert_called_with(
        data='sitewide_test_all_time_data',
        entity='test',
        stats_range='all_time',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp())
def test_get_listening_activity_year(self, mock_create_messages, mock_get_listening_activity,
                                     mock_get_listens, mock_get_latest_listen_ts):
    """Yearly listening activity should build one time range per month of
    the previous and current year and forward the right arguments to the
    mocked helpers.
    """
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df

    listening_activity_stats.get_listening_activity_year()

    to_date = datetime(2020, 6, 19)
    from_date = month = datetime(2019, 1, 1)
    # Expected rows: ["<Month Year>", month start, month end] for every month
    # between from_date (inclusive) and to_date (exclusive).
    time_range = []
    while month < to_date:
        time_range.append(
            [month.strftime('%B %Y'), month, get_month_end(month)])
        month = offset_months(month, 1, shift_backwards=False)

    time_range_df = run_query("SELECT * FROM time_range")
    time_range_result = time_range_df.rdd.map(list).collect()
    self.assertListEqual(time_range_result, time_range)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_df.createOrReplaceTempView.assert_called_with('listens')
    mock_create_messages.assert_called_with(
        data='listening_activity_table',
        stats_range='year',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp())
def get_top_artists(user_name):
    """ Prepare dataframe of top y (limit) artists listened to by the user
    where y is a config value.

        Args:
            user_name (str): User name of the user.

        Returns:
            top_artists_df (dataframe): Columns can be depicted as:
                [
                    'user_name', 'artist_name', 'artist_msid', 'count'
                ]
    """
    # NOTE(review): user_name is interpolated straight into the SQL string;
    # a name containing a double quote would break (or alter) the query —
    # confirm it is validated upstream.
    top_artists_df = run_query("""
            SELECT user_name
                 , artist_name
                 , artist_msid
                 , count(artist_msid) as count
              FROM listens_df
          GROUP BY user_name, artist_name, artist_msid
            HAVING user_name = "%s"
          ORDER BY count DESC
             LIMIT %s
        """ % (user_name, config.TOP_ARTISTS_LIMIT))
    return top_artists_df
def test_get_listening_activity_week(self, mock_create_messages, mock_get_listening_activity,
                                     mock_get_listens, mock_get_latest_listen_ts):
    """Weekly listening activity should build one time range per day of the
    past two weeks and forward the right arguments to the mocked helpers.
    """
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df

    listening_activity_stats.get_listening_activity_week()

    to_date = datetime(2020, 9, 10)
    from_date = day = datetime(2020, 8, 31)
    # Expected rows: ["<Weekday DD Month Year>", day start, day end] for every
    # day between from_date (inclusive) and to_date (exclusive).
    time_range = []
    while day < to_date:
        time_range.append(
            [day.strftime('%A %d %B %Y'), day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = run_query("SELECT * FROM time_range")
    time_range_result = time_range_df.rdd.map(list).collect()
    self.assertListEqual(time_range_result, time_range)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_df.createOrReplaceTempView.assert_called_with('listens')
    mock_create_messages.assert_called_with(
        data='listening_activity_table',
        stats_range='week',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp())
def get_day_of_week(year):
    """Yield each user's most-listened weekday for the given year.

    The emitted message's ``data`` field is a JSON object mapping
    user_id -> weekday name (e.g. "Monday"), produced in one pass:
    count listens per (user, weekday), rank weekdays per user, keep rank 1.
    """
    setup_listens_for_year(year)
    query = """
        WITH listen_weekday AS (
            -- listens per user per weekday name
            SELECT user_id
                 , date_format(listened_at, 'EEEE') AS weekday
                 , count(*) AS listen_count
              FROM listens_of_year
          GROUP BY user_id
                 , weekday
        ), top_listen_weekday AS (
            -- rank each user's weekdays by listen count; row_number (not
            -- rank) guarantees exactly one winner per user even on ties
            SELECT user_id
                 , weekday
                 , listen_count
                 , row_number() OVER(PARTITION BY user_id ORDER BY listen_count DESC) AS row_number
              FROM listen_weekday
        )
        SELECT to_json(
                    map_from_entries(
                        collect_list(
                            struct(user_id, weekday)
                        )
                    )
               ) AS all_users_weekday
          FROM top_listen_weekday
         WHERE row_number = 1
    """
    data = run_query(query).collect()
    yield {"type": "day_of_week", "data": data[0]["all_users_weekday"]}
def get_playcounts_data(listens_df, users_df, recordings_df):
    """ Prepare playcounts dataframe by joining listens_df, users_df and
    recordings_df to select distinct tracks that a user has listened to for
    all the users along with listen count.

        Args:
            listens_df: Listens dataframe.
            users_df: Users dataframe.
            recordings_df: Recordings dataframe.

        Returns:
            playcounts_df: playcounts dataframe with columns as:
                ['user_id', 'recording_id', 'count']
    """
    # Register temp views so the SQL join below can reference them by name.
    listens_df.createOrReplaceTempView('listen')
    users_df.createOrReplaceTempView('user')
    recordings_df.createOrReplaceTempView('recording')
    playcounts_df = run_query("""
        SELECT user_id,
               recording_id,
               count(recording_id) as count
          FROM listen
    INNER JOIN user
            ON listen.user_name = user.user_name
    INNER JOIN recording
            ON recording.recording_msid = listen.recording_msid
      GROUP BY user_id, recording_id
      ORDER BY user_id
    """)
    return playcounts_df
def _create_mapped_dataframe(): """ Use MSID-MBID mapping to improve the data accuracy and quality Returns: mapped_df (dataframe): A DataFrame with mapped data """ # Read the mapped data into dataframe with the needed columns mapping_df = read_files_from_HDFS(MBID_MSID_MAPPING).select( 'mb_artist_credit_name', 'mb_artist_credit_mbids', 'msb_artist_msid') mapping_df.createOrReplaceTempView('mapping') mapped_df = run_query(""" SELECT CASE WHEN isnull(mb_artist_credit_name) THEN artist_name ELSE mb_artist_credit_name END as artist_name , CASE WHEN isnull(mb_artist_credit_mbids) THEN artist_mbids ELSE mb_artist_credit_mbids END as artist_mbids , CASE WHEN isnull(mb_artist_credit_mbids) AND cardinality(artist_mbids) == 0 THEN nullif(artist_msid, "") ELSE NULL END as artist_msid , listened_at FROM listens LEFT JOIN mapping ON listens.artist_msid == mapping.msb_artist_msid """) return mapped_df
def test_get_daily_activity(self):
    """Daily activity stats should match the independently computed expected
    values and register a weekday x hour cartesian-product time_range view.
    """
    data = self._create_listens_table()
    expected = self._calculate_expected(data)

    # Collect the computed per-user daily activity into a dict keyed by user.
    received = {}
    result = daily_activity_stats.get_daily_activity()
    for entry in result:
        _dict = entry.asDict(recursive=True)
        received[_dict['user_name']] = _dict['daily_activity']

    # Rebuild the expected time_range table: every (weekday name, hour) pair.
    weekdays = [calendar.day_name[day] for day in range(0, 7)]
    hours = [hour for hour in range(0, 24)]
    time_range = itertools.product(weekdays, hours)
    time_range_df_expected = listenbrainz_spark.session.createDataFrame(
        time_range, schema=["day", "hour"]).toLocalIterator()
    time_range_expected = [
        entry.asDict(recursive=True) for entry in time_range_df_expected
    ]
    time_range_df_received = run_query(
        "SELECT * FROM time_range").toLocalIterator()
    time_range_received = [
        entry.asDict(recursive=True) for entry in time_range_df_received
    ]

    self.assertListEqual(time_range_expected, time_range_received)
    self.assertDictEqual(expected, received)
def get_artists(table: str, limit: int = SITEWIDE_STATS_ENTITY_LIMIT):
    """ Get artist information (artist_name, artist_msid etc) for every time
    range specified the "time_range" table ordered by listen count

        Args:
            table: Name of the temporary table.
            limit: number of top artists to retain

        Returns:
            iterator (iter): An iterator over result
    """
    # The CTE's ORDER BY + LIMIT keeps only the top `limit` artists; the
    # outer query folds them into a single `stats` array row.
    result = run_query(f"""
        WITH intermediate_table as (
            SELECT artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
          GROUP BY artist_name
                 , artist_credit_mbids
          ORDER BY listen_count DESC
             LIMIT {limit}
        )
        SELECT collect_list(
                    struct(
                        artist_name
                      , coalesce(artist_credit_mbids, array()) AS artist_mbids
                      , listen_count
                    )
               ) AS stats
          FROM intermediate_table
    """)
    return result.toLocalIterator()
def test_get_entity_week(self, mock_create_message, mock_filter_listens,
                         mock_get_listens, mock_get_latest_listen_ts):
    """Weekly sitewide entity stats should build one time range per day of
    the two-week window and route the filtered dataframe to the temp view.
    """
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df
    mock_filtered_df = MagicMock()
    mock_filter_listens.return_value = mock_filtered_df

    entity_stats.get_entity_week('test', False)

    from_date = datetime(2020, 8, 3)
    to_date = datetime(2020, 8, 17)
    # Expected rows: [day label, day start ts, day end ts] per day in window.
    day = from_date
    time_range = []
    while day < to_date:
        time_range.append([day.strftime('%A %d %B %Y'),
                           int(day.timestamp()),
                           int(get_day_end(day).timestamp())])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = run_query("SELECT * FROM time_range")
    time_range_result = time_range_df.rdd.map(list).collect()
    self.assertListEqual(time_range_result, time_range)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_filter_listens.assert_called_with(mock_df, from_date, to_date)
    mock_filtered_df.createOrReplaceTempView.assert_called_with('sitewide_test_week')
    mock_create_message.assert_called_with(data='sitewide_test_week_data',
                                           entity='test',
                                           stats_range='week',
                                           from_ts=from_date.timestamp(),
                                           to_ts=to_date.timestamp())
def get_listen_count(year):
    """Yield the yearly listen count message for the year-in-music report."""
    setup_listens_for_year(year)
    rows = run_query(_get_yearly_listen_counts()).collect()
    counts = rows[0]["yearly_listen_counts"]
    yield {
        "type": "year_in_music_listen_count",
        "data": counts,
    }
def prepare_recording_data(table):
    """ Prepare recordings dataframe to select distinct recordings/tracks
    listened to and assign each recording a unique integer id.

        Args:
            table (str): Registered dataframe to run SQL queries.

        Returns:
            recordings_df (dataframe): Columns can be depicted as:
                [
                    'track_name', 'recording_msid', 'artist_name', 'artist_msid',
                    'release_name', 'release_msid', 'recording_id'
                ]
    """
    # Bug fix: the window previously ordered by the string literal
    # 'recording_msid' (a constant), so recording_id assignment was
    # nondeterministic across runs. Ordering by the actual column makes the
    # ids stable for a given set of recordings.
    recordings_df = run_query("""
        SELECT track_name
             , recording_msid
             , artist_name
             , artist_msid
             , release_name
             , release_msid
             , row_number() OVER (ORDER BY recording_msid) AS recording_id
          FROM (SELECT DISTINCT recording_msid, track_name, artist_name, artist_msid,
                       release_name, release_msid
                  FROM %s)
    """ % (table))
    return recordings_df
def test_run_query(self):
    """run_query should return a dataframe over the registered temp view."""
    source_df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
    utils.register_dataframe(source_df, "table")
    queried_df = stats.run_query(""" SELECT * FROM table """)
    self.assertEqual(queried_df.count(), source_df.count())
def get_most_prominent_color(year):
    """Yield the most prominent release color per user for the given year."""
    setup_listens_for_year(year)
    # Register the release colors so the aggregate query can join on them.
    _get_release_colors().createOrReplaceTempView("release_color")
    rows = run_query(_get_most_prominent_color()).collect()
    yield {
        "type": "most_prominent_color",
        "data": rows[0]["all_users_colors"],
    }
def get_most_listened_year(year):
    """Yield per-user listen counts grouped by release year for the report."""
    setup_listens_for_year(year)
    setup_all_releases()
    rows = run_query(_get_releases_with_date()).collect()
    yield {
        "type": "most_listened_year",
        "data": rows[0]["all_user_yearly_counts"],
    }
def get_releases(table):
    """ Get release information (release_name, release_mbid etc) for every
    user ordered by listen count (number of times a user has listened to
    tracks which belong to a particular release).

        Args:
            table: name of the temporary table

        Returns:
            iterator (iter): an iterator over result
                    {
                        'user1' : [{
                            'release_name': str
                            'release_msid': str,
                            'release_mbid': str,
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        'user2' : [{...}],
                    }
    """
    # Group case-insensitively on name (lower(...)) but keep one original
    # spelling via first(); listens with an empty release_name are skipped.
    # sort_array(..., false) sorts each user's releases by listen_count desc
    # because listen_count is the first struct field.
    result = run_query(f"""
        WITH intermediate_table as (
            SELECT user_id
                 , first(release_name) AS any_release_name
                 , release_mbid
                 , first(artist_name) AS any_artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
             WHERE release_name != ''
          GROUP BY user_id
                 , lower(release_name)
                 , release_mbid
                 , lower(artist_name)
                 , artist_credit_mbids
        )
        SELECT user_id
             , sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_release_name AS release_name
                          , release_mbid
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                    , false
               ) as releases
          FROM intermediate_table
      GROUP BY user_id
    """)
    return result.toLocalIterator()
def get_releases(table: str, limit: int = SITEWIDE_STATS_ENTITY_LIMIT):
    """ Get release information (release_name, release_mbid etc) ordered by
    listen count (number of times listened to tracks which belong to a
    particular release).

        Args:
            table: name of the temporary table
            limit: number of top releases to retain

        Returns:
            iterator: an iterator over result, contains only 1 row
                    {
                        [
                            {
                                'release_name': str
                                'release_msid': str,
                                'release_mbid': str,
                                'artist_name': str,
                                'artist_msid': str,
                                'artist_mbids': list(str),
                                'listen_count': int
                            },
                            ...
                        ],
                    }
    """
    # The CTE's ORDER BY + LIMIT keeps the top releases; sort_array(...,
    # false) then orders the collected structs by listen_count desc, since
    # collect_list gives no ordering guarantee.
    result = run_query(f"""
        WITH intermediate_table as (
            SELECT first(release_name) AS any_release_name
                 , release_mbid
                 , first(artist_name) AS any_artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
             WHERE release_name != ''
          GROUP BY lower(release_name)
                 , release_mbid
                 , lower(artist_name)
                 , artist_credit_mbids
          ORDER BY listen_count DESC
             LIMIT {limit}
        )
        SELECT sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_release_name AS release_name
                          , release_mbid
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                    , false
               ) as stats
          FROM intermediate_table
    """)
    return result.toLocalIterator()
def calculate_listening_activity(): """ Calculate number of listens for each user in time ranges given in the "time_range" table. The time ranges are as follows: 1) week - each day with weekday name of the past 2 weeks. 2) month - each day the past 2 months. 3) year - each month of the past 2 years. 4) all_time - each year starting from LAST_FM_FOUNDING_YEAR (2002) """ # Calculate the number of listens in each time range for each user except the time ranges which have zero listens. result_without_zero_days = run_query(""" SELECT listens.user_name , time_range.time_range , count(listens.user_name) as listen_count FROM listens JOIN time_range ON listens.listened_at >= time_range.start AND listens.listened_at <= time_range.end GROUP BY listens.user_name , time_range.time_range """) result_without_zero_days.createOrReplaceTempView( "result_without_zero_days") # Add the time ranges which have zero listens to the previous dataframe result = run_query(""" SELECT dist_user_name.user_name , time_range.time_range , to_unix_timestamp(time_range.start) as from_ts , to_unix_timestamp(time_range.end) as to_ts , ifnull(result_without_zero_days.listen_count, 0) as listen_count FROM (SELECT DISTINCT user_name FROM listens) dist_user_name CROSS JOIN time_range LEFT JOIN result_without_zero_days ON result_without_zero_days.user_name = dist_user_name.user_name AND result_without_zero_days.time_range = time_range.time_range """) # Create a table with a list of time ranges and corresponding listen count for each user iterator = result \ .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \ .groupBy("user_name") \ .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \ .toLocalIterator() return iterator
def calculate_daily_activity(): """ Calculate number of listens for each user in each hour. """ # Genarate a dataframe containing hours of all days of the week weekdays = [calendar.day_name[day] for day in range(0, 7)] hours = [hour for hour in range(0, 24)] time_range = itertools.product(weekdays, hours) time_range_df = listenbrainz_spark.session.createDataFrame( time_range, schema=["day", "hour"]) time_range_df.createOrReplaceTempView("time_range") # Truncate listened_at to day and hour to improve matching speed formatted_listens = run_query(""" SELECT user_id , date_format(listened_at, 'EEEE') as day , date_format(listened_at, 'H') as hour FROM listens """) formatted_listens.createOrReplaceTempView("listens") # Calculate the number of listens in each time range for each user except the time ranges which have zero listens. result = run_query(""" SELECT listens.user_id , time_range.day , time_range.hour , count(*) as listen_count FROM listens JOIN time_range ON listens.day == time_range.day AND listens.hour == time_range.hour GROUP BY listens.user_id , time_range.day , time_range.hour """) # Create a table with a list of time ranges and corresponding listen count for each user iterator = result \ .withColumn("daily_activity", struct("hour", "day", "listen_count")) \ .groupBy("user_id") \ .agg(sort_array(collect_list("daily_activity")).alias("daily_activity")) \ .toLocalIterator() return iterator
def get_releases(table):
    """ Get release information (release_name, release_mbid etc) for every
    user ordered by listen count (number of times a user has listened to
    tracks which belong to a particular release).

        Args:
            table: name of the temporary table

        Returns:
            artists: A dict of dicts which can be depicted as:
                    {
                        'user1' : [{
                            'release_name': str
                            'release_msid': str,
                            'release_mbid': str,
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': str,
                            'listen_count': int
                        }],
                        'user2' : [{...}],
                    }
    """
    t0 = time.time()
    # Global ORDER BY cnt DESC means each user's list below is appended in
    # descending listen-count order.
    query = run_query("""
        SELECT user_name
             , release_name
             , release_msid
             , release_mbid
             , artist_name
             , artist_msid
             , artist_mbids
             , count(release_msid) as cnt
          FROM %s
      GROUP BY user_name, release_name, release_msid, release_mbid,
               artist_name, artist_msid, artist_mbids
      ORDER BY cnt DESC
    """ % (table))
    # NOTE(review): collect() materialises every row on the driver — fine for
    # small stats tables, confirm for large datasets.
    rows = query.collect()
    releases = defaultdict(list)
    for row in rows:
        releases[row.user_name].append({
            'release_name': row.release_name,
            'release_msid': row.release_msid,
            'release_mbid': row.release_mbid,
            'artist_name': row.artist_name,
            'artist_msid': row.artist_msid,
            'artist_mbids': row.artist_mbids,
            'listen_count': row.cnt,
        })
    print("Query to calculate release stats processed in %.2f s" % (time.time() - t0))
    return releases
def get_recordings(table: str, limit: int = SITEWIDE_STATS_ENTITY_LIMIT): """ Get recordings information (artist_name, artist_msid etc) for every time range specified ordered by listen count. Args: table: Name of the temporary table. limit: number of top artists to retain Returns: iterator (iter): An iterator over result """ # we sort twice, the ORDER BY in CTE sorts to eliminate all # but top LIMIT results. collect_list's docs mention that the # order of collected results is not guaranteed so sort again # with sort_array. result = run_query(f""" WITH intermediate_table as ( SELECT first(recording_name) AS any_recording_name , recording_mbid , first(artist_name) AS any_artist_name , artist_credit_mbids , nullif(first(release_name), '') as any_release_name , release_mbid , count(*) as listen_count FROM {table} GROUP BY lower(recording_name) , recording_mbid , lower(artist_name) , artist_credit_mbids , lower(release_name) , release_mbid ORDER BY listen_count DESC LIMIT {limit} ) SELECT sort_array( collect_list( struct( listen_count , any_recording_name AS track_name , recording_mbid , any_artist_name AS artist_name , coalesce(artist_credit_mbids, array()) AS artist_mbids , any_release_name AS release_name , release_mbid ) ) , false ) as stats FROM intermediate_table """) return result.toLocalIterator()
def get_listening_activity(): """ Calculate number of listens for each user in time ranges given in the 'time_range' table """ # Calculate the number of listens in each time range for each user except the time ranges which have zero listens. result_without_zero_days = run_query(""" SELECT listens.user_name , time_range.time_range , count(listens.user_name) as listen_count FROM listens JOIN time_range ON listens.listened_at >= time_range.start AND listens.listened_at <= time_range.end GROUP BY listens.user_name , time_range.time_range """) result_without_zero_days.createOrReplaceTempView('result_without_zero_days') # Add the time ranges which have zero listens to the previous dataframe result = run_query(""" SELECT dist_user_name.user_name , time_range.time_range , to_unix_timestamp(time_range.start) as from_ts , to_unix_timestamp(time_range.end) as to_ts , ifnull(result_without_zero_days.listen_count, 0) as listen_count FROM (SELECT DISTINCT user_name FROM listens) dist_user_name CROSS JOIN time_range LEFT JOIN result_without_zero_days ON result_without_zero_days.user_name = dist_user_name.user_name AND result_without_zero_days.time_range = time_range.time_range """) # Create a table with a list of time ranges and corresponding listen count for each user iterator = result \ .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \ .groupBy("user_name") \ .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \ .toLocalIterator() return iterator
def get_top_artists_with_collab():
    """
    Prepare dataframe consisting of top artists with non zero collaborations.

    Returns:
        top_artists_with_collab_df (dataframe): Column can be depicted as:
            [
                'artist_name'
            ]
    """
    # Any artist appearing in similar_artist has at least one collaboration.
    return run_query("""
        SELECT DISTINCT artist_name
          FROM similar_artist
    """)
def test_get_daily_activity(self):
    """All-time daily activity should register a weekday x hour time_range
    view and match the JSON fixture exactly.
    """
    received = daily_activity.get_daily_activity('all_time')

    # Expected time ranges: every (weekday name, hour) pair.
    time_range_expected = itertools.product(calendar.day_name, range(0, 24))
    time_range_received = run_query(
        "SELECT * FROM time_range").toLocalIterator()
    self.assertListEqual(list(time_range_expected), list(time_range_received))

    with open(self.path_to_data_file('user_daily_activity.json')) as f:
        expected = json.load(f)
    self.assertListEqual(expected, list(received))
def get_artists(table: str, user_listen_count_limit, top_artists_limit: int = SITEWIDE_STATS_ENTITY_LIMIT): """ Get artist information (artist_name, artist_msid etc) for every time range specified the "time_range" table ordered by listen count Args: table: name of the temporary table user_listen_count_limit: per user per entity listen count above which it should be capped top_artists_limit: number of top artists to retain Returns: iterator (iter): An iterator over result """ # we sort twice, the ORDER BY in CTE sorts to eliminate all # but top LIMIT results. collect_list's docs mention that the # order of collected results is not guaranteed so sort again # with sort_array. result = run_query(f""" WITH user_counts as ( SELECT user_id , first(artist_name) AS artist_name , artist_credit_mbids , LEAST(count(*), {user_listen_count_limit}) as listen_count FROM {table} GROUP BY user_id , lower(artist_name) , artist_credit_mbids ), intermediate_table AS ( SELECT first(artist_name) AS artist_name , artist_credit_mbids , SUM(listen_count) as total_listen_count FROM user_counts GROUP BY lower(artist_name) , artist_credit_mbids ORDER BY total_listen_count DESC LIMIT {top_artists_limit} ) SELECT sort_array( collect_list( struct( total_listen_count AS listen_count , artist_name , coalesce(artist_credit_mbids, array()) AS artist_mbids ) ) , false ) AS stats FROM intermediate_table """) return result.toLocalIterator()
def get_user_id(user_name):
    """
    Get user id using user name.

    Args:
        user_name: Name of the user.

    Returns:
        user_id: User id of the user.
    """
    query = """
        SELECT user_id
          FROM user
         WHERE user_name = '%s'
    """ % user_name
    row = run_query(query).first()
    return row['user_id']
def get_latest_listen_ts():
    """ Get the timestamp of the latest timestamp present in spark cluster """
    now = datetime.now()
    # Step back one month at a time until a month with listens exists in HDFS.
    # NOTE(review): if no listens exist at all this loop never terminates —
    # confirm whether an upper bound is needed.
    while True:
        try:
            df = utils.get_listens(now, now, LISTENBRAINZ_DATA_DIRECTORY)
            break
        except HDFSException:
            now = offset_months(now, 1)
    df.createOrReplaceTempView('latest_listen_ts')
    result = run_query(
        "SELECT MAX(listened_at) as max_timestamp FROM latest_listen_ts")
    rows = result.collect()
    return rows[0]['max_timestamp']
def get_artists(table: str) -> Iterator[UserArtistRecord]:
    """ Get artist information (artist_name, artist_credit_id etc) for every
    user ordered by listen count

        Args:
            table: name of the temporary table.

        Returns:
            iterator (iter): an iterator over result
                    {
                        user1: [
                            {
                                'artist_name': str,
                                'artist_credit_id': int,
                                'listen_count': int
                            }
                        ],
                        user2: [{...}],
                    }
    """
    # Group case-insensitively on artist name but keep one original spelling
    # via first(); sort_array(..., false) orders each user's artists by
    # listen_count desc (first struct field).
    result = run_query(f"""
        WITH intermediate_table as (
            SELECT user_name
                 , first(artist_name) AS any_artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
          GROUP BY user_name
                 , lower(artist_name)
                 , artist_credit_mbids
        )
        SELECT user_name
             , sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                    , false
               ) as artists
          FROM intermediate_table
      GROUP BY user_name
    """)
    return result.toLocalIterator()
def test_get_daily_activity(self):
    """All-time daily activity messages should match the fixture field by
    field, with the data compared order-independently.
    """
    received = list(daily_activity.get_daily_activity('all_time'))

    # Expected time ranges: every (weekday name, hour) pair.
    time_range_expected = itertools.product(calendar.day_name, range(0, 24))
    time_range_received = run_query("SELECT * FROM time_range").toLocalIterator()
    self.assertListEqual(list(time_range_expected), list(time_range_received))

    with open(self.path_to_data_file('user_daily_activity.json')) as f:
        expected = json.load(f)

    self.assertEqual(len(received), len(expected))
    self.assertEqual(received[0]["type"], expected[0]["type"])
    self.assertEqual(received[0]["stats_range"], expected[0]["stats_range"])
    self.assertEqual(received[0]["from_ts"], expected[0]["from_ts"])
    self.assertEqual(received[0]["to_ts"], expected[0]["to_ts"])
    # Data rows may arrive in any order, so compare as multisets.
    self.assertCountEqual(received[0]["data"], expected[0]["data"])
def get_artists(table):
    """ Get artist information (artist_name, artist_msid etc) for every user
    ordered by listen count

        Args:
            table (str): name of the temporary table.

        Returns:
            iterator (iter): an iterator over result
                    {
                        user1: [{
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        user2: [{...}],
                    }
    """
    # artist_msid is only kept when there are no MBIDs for the listen; empty
    # msids are nulled so msid never competes with mbid-based identities.
    result = run_query("""
        WITH intermediate_table as (
            SELECT user_name
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
              FROM {table}
        )
        SELECT *
             , count(*) as listen_count
          FROM intermediate_table
      GROUP BY user_name
             , artist_name
             , artist_msid
             , artist_mbids
    """.format(table=table))

    # Collect each user's artists sorted by listen_count desc (first struct
    # field drives sort_array's ordering).
    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()
    return iterator