def get_listening_activity_month() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate number of listens for an user on each day of the past month and current month.

    Returns:
        messages: An iterator over the listening-activity stat messages produced
            by ``create_messages`` for the 'month' stats range.
    """
    current_app.logger.debug("Calculating listening_activity_month")

    to_date = get_latest_listen_ts()
    # Truncate to midnight (00:00) so the window covers whole days only
    to_date = datetime(to_date.year, to_date.month, to_date.day)
    # First day of the previous month (offset_months shifts backwards by default)
    from_date = offset_months(replace_days(to_date, 1), 1)

    # Generate a dataframe containing the days of the last and current month
    # along with the start and end time of each day
    time_range = []
    day = from_date  # was recomputed with the identical expression; reuse from_date instead
    while day < to_date:
        time_range.append([day.strftime('%d %B %Y'), day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)
    data = get_listening_activity()
    messages = create_messages(data=data, stats_range='month',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return messages
def test_offset_months(self):
    """offset_months shifts forward with shift_backwards=False and backwards by default."""
    base = datetime.datetime(2019, 5, 12)

    forward = stats.offset_months(base, 3, shift_backwards=False)
    self.assertEqual(forward, datetime.datetime(2019, 8, 12))

    backward = stats.offset_months(base, 3)
    self.assertEqual(backward, datetime.datetime(2019, 2, 12))
def get_listening_activity_year() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for an user in each month of the past and current year.

    Returns:
        messages: An iterator over the listening-activity stat messages produced
            by ``create_messages`` for the 'year' stats range.
    """
    logger.debug("Calculating listening_activity_year")

    to_date = get_latest_listen_ts()
    # January 1st of the previous year
    from_date = datetime(to_date.year - 1, 1, 1)

    # Generate a dataframe containing the months of the last and current year
    # along with the start and end time of each month
    time_range = []
    month = from_date  # was recomputed with the identical expression; reuse from_date instead
    while month < to_date:
        time_range.append([month.strftime('%B %Y'), month, get_month_end(month)])
        month = offset_months(month, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)
    data = get_listening_activity()
    messages = create_messages(data=data, stats_range='year',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages
def test_get_listening_activity_year(self, mock_create_messages, mock_get_listening_activity,
                                     mock_get_listens, mock_get_latest_listen_ts):
    """Check the time_range view contents and the calls made by get_listening_activity_year."""
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df

    listening_activity_stats.get_listening_activity_year()

    to_date = datetime(2020, 6, 19)
    from_date = month = datetime(2019, 1, 1)

    # Build the expected [label, month start, month end] rows for every month in the window
    expected_rows = []
    while month < to_date:
        expected_rows.append([month.strftime('%B %Y'), month, get_month_end(month)])
        month = offset_months(month, 1, shift_backwards=False)

    actual_rows = run_query("SELECT * FROM time_range").rdd.map(list).collect()
    self.assertListEqual(actual_rows, expected_rows)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_df.createOrReplaceTempView.assert_called_with('listens')
    mock_create_messages.assert_called_with(data='listening_activity_table', stats_range='year',
                                            from_ts=from_date.timestamp(),
                                            to_ts=to_date.timestamp())
def get_listens(from_date, to_date, dest_path):
    """ Prepare dataframe of months falling between from_date and to_date (both inclusive).

        Args:
            from_date (datetime): Date from which to start fetching listens.
            to_date (datetime): Date up to which to fetch listens.
            dest_path (str): HDFS path to fetch listens from.

        Returns:
            df: Dataframe of listens.

        Raises:
            ValueError: If from_date is greater than to_date.
            HDFSException: If no listen data exists anywhere in the window.
    """
    if to_date < from_date:
        # Bug fix: the original interpolated type(ValueError).__name__, which is
        # the metaclass name 'type', not 'ValueError'.
        raise ValueError(
            '{}: Data generation window is negative i.e. from_date (date from which start fetching listens)'
            ' is greater than to_date (date upto which fetch listens).'.format(ValueError.__name__))
    df = None
    while from_date <= to_date:
        try:
            month = read_files_from_HDFS('{}/{}/{}.parquet'.format(
                dest_path, from_date.year, from_date.month))
            df = df.union(month) if df else month
        except PathNotFoundException as err:
            # A missing month is tolerated: log it and keep scanning the window.
            current_app.logger.debug(
                '{}\nFetching file for next date...'.format(err))
        # Advance to the first day of the next month
        from_date = stats.offset_months(date=from_date, months=1, shift_backwards=False)
        from_date = stats.replace_days(from_date, 1)
    if not df:
        # Typo fix in the log message: "form" -> "from"
        current_app.logger.error('Listening history missing from HDFS')
        raise HDFSException("Listening history missing from HDFS")
    return df
def get_latest_listen_ts():
    """ Get the timestamp of the latest timestamp present in spark cluster """
    probe = datetime.now()
    # Step back one month at a time until a parquet file with listens is found
    while True:
        try:
            df = utils.get_listens(probe, probe, LISTENBRAINZ_DATA_DIRECTORY)
            break
        except HDFSException:
            probe = offset_months(probe, 1)

    df.createOrReplaceTempView('latest_listen_ts')
    rows = run_query(
        "SELECT MAX(listened_at) as max_timestamp FROM latest_listen_ts").collect()
    return rows[0]['max_timestamp']
def test_filter_listens(self):
    """filter_listens keeps only the rows whose listened_at lies inside the window."""
    from_date = datetime(2020, 5, 1)
    to_date = datetime(2020, 5, 31)

    # Two timestamps inside the window, two outside (one month before/after)
    inside_early = offset_days(from_date, 5, shift_backwards=False)
    inside_late = offset_days(to_date, 5)
    before_window = offset_months(from_date, 1)
    after_window = offset_months(to_date, 1, shift_backwards=False)

    df = utils.create_dataframe(Row(listened_at=before_window), None)
    df = df.union(utils.create_dataframe(Row(listened_at=after_window), None))
    df = df.union(utils.create_dataframe(Row(listened_at=inside_early), None))
    df = df.union(utils.create_dataframe(Row(listened_at=inside_late), None))

    rows = stats_utils.filter_listens(df, from_date, to_date).collect()

    self.assertEqual(rows[0]['listened_at'], inside_early)
    self.assertEqual(rows[1]['listened_at'], inside_late)
def get_entity_month(
    entity: str, use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the monthly sitewide top entity.

    Args:
        entity: The entity to calculate stats for.
        use_mapping: Whether to use the MSID-MBID mapping while calculating the stats.

    Returns:
        message: The message produced by ``create_message`` for the 'month' stats range.
    """
    current_app.logger.debug("Calculating sitewide_{}_month...".format(entity))

    to_date = get_latest_listen_ts()
    # Truncate to midnight (00:00) so the window covers whole days only
    to_date = datetime(to_date.year, to_date.month, to_date.day)
    # First day of the previous month
    from_date = replace_days(offset_months(to_date, 1, shift_backwards=True), 1)

    # One row per day in the window: [label, start timestamp, end timestamp]
    time_range = []
    day = from_date
    while day < to_date:
        time_range.append([
            day.strftime('%d %B %Y'),
            int(day.timestamp()),
            int(get_day_end(day).timestamp())
        ])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(
        time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_month'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "dd MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='month',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return message
def test_get_listening_activity_year(self, mock_create_messages, _, mock_get_listens):
    """Check the time_range view contents and the calls made for the 'year' stats range."""
    listening_activity_stats.get_listening_activity('year')

    from_date = month = datetime(2019, 1, 1)
    to_date = datetime(2021, 1, 1)

    # Expected [label, month start, month end] rows for each month in the window
    expected_rows = []
    while month < to_date:
        expected_rows.append([month.strftime('%B %Y'), month, get_month_end(month)])
        month = offset_months(month, 1, shift_backwards=False)

    actual_rows = run_query("SELECT * FROM time_range").rdd.map(list).collect()
    self.assertListEqual(actual_rows, expected_rows)

    mock_get_listens.assert_called_with(from_date, to_date)
    mock_create_messages.assert_called_with(data='activity_table', stats_range='year',
                                            from_date=from_date, to_date=to_date)
def get_entity_year(
    entity: str, use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the yearly sitewide top entity.

    Args:
        entity: The entity to calculate stats for.
        use_mapping: Whether to use the MSID-MBID mapping while calculating the stats.

    Returns:
        message: The message produced by ``create_message`` for the 'year' stats range.
    """
    current_app.logger.debug("Calculating sitewide_{}_year...".format(entity))

    to_date = get_latest_listen_ts()
    # January 1st of the previous year
    from_date = datetime(to_date.year - 1, 1, 1)

    # One row per month in the window: [label, start timestamp, end timestamp]
    time_range = []
    month = from_date
    while month < to_date:
        time_range.append([
            month.strftime('%B %Y'),
            int(month.timestamp()),
            int(get_month_end(month).timestamp())
        ])
        month = offset_months(month, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(
        time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_year'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='year',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return message