def get_entity_all_time(entity: str, use_mapping: bool = False) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the all_time sitewide top entity """
    logger.debug("Calculating sitewide_{}_all_time...".format(entity))

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # Build one row per year between from_date and to_date:
    # [display label, start-of-year timestamp, end-of-year timestamp]
    time_range = []
    for year in range(from_date.year, to_date.year + 1):
        time_range.append([
            str(year),
            int(datetime(year, 1, 1).timestamp()),
            int(get_year_end(year).timestamp()),
        ])

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    table_name = 'sitewide_{}_all_time'.format(entity)
    listens_df.createOrReplaceTempView(table_name)

    # Dispatch to the per-entity aggregation; "yyyy" is the Spark date format
    # matching the labels in the time_range view.
    handler = entity_handler_map[entity]
    data = handler(table_name, "yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='all_time',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return message
def test_get_mapped_artist_and_recording_mbids(self):
    """Mapped listens must contain exactly the expected rows, columns and output file."""
    to_date = get_latest_listen_ts()
    partial_listen_df = dataframe_utils.get_listens_for_training_model_window(
        to_date, to_date, self.listens_path)

    mapping_df = mapping_utils.get_unique_rows_from_mapping(
        utils.read_files_from_HDFS(self.mapping_path))

    mapped_listens_path = '/mapped_listens.parquet'
    mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(
        partial_listen_df, mapping_df, mapped_listens_path)

    self.assertEqual(mapped_listens.count(), 8)

    expected_cols = [
        'listened_at',
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
        'user_name',
    ]
    self.assertListEqual(sorted(expected_cols), sorted(mapped_listens.columns))

    # The mapped listens must also have been persisted to HDFS.
    self.assertTrue(utils.path_exists(mapped_listens_path))
def get_listening_activity_year() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user in each month of the past and current year. """
    logger.debug("Calculating listening_activity_year")

    to_date = get_latest_listen_ts()
    from_date = datetime(to_date.year - 1, 1, 1)

    # Build one row per month of the previous and current year:
    # [display label, month start, month end]
    time_range = []
    month = from_date
    while month < to_date:
        time_range.append([month.strftime('%B %Y'), month, get_month_end(month)])
        month = offset_months(month, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)
    data = get_listening_activity()
    messages = create_messages(data=data, stats_range='year',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_listening_activity_month() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate number of listens for a user on each day of the past month and current month. """
    logger.debug("Calculating listening_activity_month")

    to_date = get_latest_listen_ts()
    from_date = offset_months(replace_days(to_date, 1), 1)
    # Truncate to midnight on the first day of the previous month
    from_date = datetime(from_date.year, from_date.month, from_date.day)

    # Build one row per day of the previous and current month:
    # [display label, day start, day end]
    time_range = []
    day = from_date
    while day < to_date:
        time_range.append([day.strftime('%d %B %Y'), day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)
    data = get_listening_activity()
    messages = create_messages(data=data, stats_range='month',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def test_get_listens_for_training_model_window(self):
    """The training window helper must add matchable columns and keep all in-range rows."""
    to_date = get_latest_listen_ts()
    from_date = stats.offset_days(to_date, 2)
    # Removed leftover debug print(to_date, from_date) — tests should not
    # write to stdout.
    test_df = dataframe_utils.get_listens_for_training_model_window(to_date, from_date, self.listens_path)

    # Columns derived for artist/track matching must be present.
    self.assertIn('artist_name_matchable', test_df.columns)
    self.assertIn('track_name_matchable', test_df.columns)
    self.assertEqual(test_df.count(), 11)
def test_get_latest_listen_ts(self):
    """The most recent listened_at timestamp in the saved parquet should be returned."""
    newest = datetime(2020, 5, 18)
    older = offset_days(newest, 7)

    df = utils.create_dataframe(Row(listened_at=newest), schema=None).union(
        utils.create_dataframe(Row(listened_at=older), schema=None))
    utils.save_parquet(df, '{}/2020/5.parquet'.format(self.path_))

    self.assertEqual(newest, stats_utils.get_latest_listen_ts())
def test_get_listens_for_training_model_window(self):
    """The helper must record window bounds in metadata and drop MBID columns."""
    metadata = {}
    to_date = get_latest_listen_ts()
    from_date = stats.offset_days(to_date, 2)

    test_df = create_dataframes.get_listens_for_training_model_window(
        to_date, from_date, metadata, self.listens_path)

    # The window bounds are written into the metadata dict passed in.
    self.assertEqual(metadata['to_date'], to_date)
    self.assertEqual(metadata['from_date'], from_date)
    # MBID columns are stripped from the training dataframe.
    self.assertNotIn('artist_mbids', test_df.columns)
    self.assertNotIn('recording_mbid', test_df.columns)
def get_daily_activity_all_time() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of week, over all time. """
    logger.debug("Calculating daily_activity_all_time")

    # All-time window: from Last.fm's founding year up to the latest listen.
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)
    to_date = get_latest_listen_ts()

    _get_listens(from_date, to_date)
    data = get_daily_activity()
    messages = create_messages(data=data, stats_range='all_time',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_dates_to_train_data(train_model_window):
    """ Get window to fetch listens to train data.

        Args:
            train_model_window (int): model to be trained on data of given number of days.

        Returns:
            to_date (datetime): Date upto which fetch listens.
            from_date (datetime): Date from which start fetching listens.
    """
    to_date = get_latest_listen_ts()
    # Go back train_model_window days, then snap to the first of that month.
    from_date = replace_days(offset_days(to_date, train_model_window), 1)
    return to_date, from_date
def get_daily_activity_year() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of week of the current year. """
    logger.debug("Calculating daily_activity_year")

    to_date = get_latest_listen_ts()
    # Window opens at midnight on January 1st of the latest listen's year.
    from_date = datetime(to_date.year, 1, 1)

    _get_listens(from_date, to_date)
    data = get_daily_activity()
    messages = create_messages(data=data, stats_range='year',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_daily_activity_month() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of week of the current month. """
    logger.debug("Calculating daily_activity_month")

    to_date = get_latest_listen_ts()
    first_of_month = replace_days(to_date, 1)
    # Truncate to midnight on the first of the month
    from_date = datetime(first_of_month.year, first_of_month.month, first_of_month.day)

    _get_listens(from_date, to_date)
    data = get_daily_activity()
    messages = create_messages(data=data, stats_range='month',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_daily_activity_week() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of the past week. """
    logger.debug("Calculating daily_activity_week")

    latest_listen = get_latest_listen_ts()
    # The week ends on the most recent Monday and spans the 7 days before it.
    to_date = get_last_monday(latest_listen)
    from_date = offset_days(to_date, 7)

    _get_listens(from_date, to_date)
    data = get_daily_activity()
    messages = create_messages(data=data, stats_range='week',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_entity_week(entity: str, use_mapping: bool = False) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the weekly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_week...".format(entity))

    latest_listen = get_latest_listen_ts()
    to_date = get_last_monday(latest_listen)
    # Truncate to midnight on that Monday
    to_date = datetime(to_date.year, to_date.month, to_date.day)
    from_date = offset_days(to_date, 14)

    # Build one row per day of the last and current week:
    # [display label, day start timestamp, day end timestamp]
    time_range = []
    day = from_date
    while day < to_date:
        time_range.append([
            day.strftime('%A %d %B %Y'),
            int(day.timestamp()),
            int(get_day_end(day).timestamp()),
        ])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    # Drop listens outside the exact window before aggregating.
    filtered_df = filter_listens(listens_df, from_date, to_date)
    table_name = 'sitewide_{}_week'.format(entity)
    filtered_df.createOrReplaceTempView(table_name)

    # "EEEE dd MMMM yyyy" is the Spark date format matching the day labels above.
    handler = entity_handler_map[entity]
    data = handler(table_name, "EEEE dd MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='week',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
def get_entity_year(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the year top entity for all users """
    logger.debug("Calculating {}_year...".format(entity))

    to_date = get_latest_listen_ts()
    # Start of the current year: both month and day reset to 1.
    from_date = replace_days(replace_months(to_date, 1), 1)

    table_name = 'user_{}_year'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='year',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_entity_all_time(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the all_time top entity for all users """
    logger.debug("Calculating {}_all_time...".format(entity))

    to_date = get_latest_listen_ts()
    # All-time window opens at Last.fm's founding year.
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    table_name = 'user_{}_all_time'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='all_time',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_entity_year(entity: str, use_mapping: bool = False) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the yearly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_year...".format(entity))

    to_date = get_latest_listen_ts()
    from_date = datetime(to_date.year - 1, 1, 1)

    # Build one row per month of the last and current year:
    # [display label, month start timestamp, month end timestamp]
    time_range = []
    month = from_date
    while month < to_date:
        time_range.append([
            month.strftime('%B %Y'),
            int(month.timestamp()),
            int(get_month_end(month).timestamp()),
        ])
        month = offset_months(month, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_year'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    # "MMMM yyyy" is the Spark date format matching the month labels above.
    handler = entity_handler_map[entity]
    data = handler(table_name, "MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='year',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
def get_entity_week(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the weekly top entity for all users """
    logger.debug("Calculating {}_week...".format(entity))

    latest_listen = get_latest_listen_ts()
    # The stats week ends on the most recent Monday and spans the 7 days before it.
    to_date = get_last_monday(latest_listen)
    from_date = offset_days(to_date, 7)

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    # Drop listens outside the exact week window before aggregating.
    filtered_df = filter_listens(listens_df, from_date, to_date)
    table_name = 'user_{}_week'.format(entity)
    filtered_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='week',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for an user in each year starting from LAST_FM_FOUNDING_YEAR (2002). """
    logger.debug("Calculating listening_activity_all_time")
    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # Accumulate per-user listen counts one year at a time; years with no
    # listens are skipped entirely (hence "without zero years").
    result_without_zero_years = None
    for year in range(from_date.year, to_date.year+1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens present in df
            continue
        # NOTE(review): relies on _get_listens registering a temp view named
        # 'listens' for the requested range — confirm against its definition.
        year_df = run_query(""" SELECT user_name, count(user_name) as listen_count FROM listens GROUP BY user_name """)
        # Tag every per-user row with this year's label and time bounds so the
        # rows from different years can be unioned and later grouped per user.
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn('to_ts', lit(year_end.timestamp()))
        result_without_zero_years = result_without_zero_years.union(year_df) if result_without_zero_years else year_df

    # Create a table with a list of time ranges and corresponding listen count for each user
    # NOTE(review): if no year yielded any listens, result_without_zero_years
    # is still None and the chained call below raises AttributeError — verify
    # callers can never hit this, or add a guard.
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time', from_ts=from_date.timestamp(),
                               to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages