def test_offset_days(self):
    """offset_days shifts a date forward or backward by the given number of days."""
    base = datetime.datetime(2019, 5, 12)
    # Forward shift when shift_backwards is disabled.
    self.assertEqual(
        stats.offset_days(base, 3, shift_backwards=False),
        datetime.datetime(2019, 5, 15),
    )
    # Backward shift is the default behaviour.
    self.assertEqual(stats.offset_days(base, 3), datetime.datetime(2019, 5, 9))
def get_listening_activity_week() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """Calculate the number of listens per user for each day of the past and current week."""
    logger.debug("Calculating listening_activity_week")
    to_date = get_latest_listen_ts()
    # Start from the Monday of the previous week, truncated to midnight.
    from_date = offset_days(get_last_monday(to_date), 7)
    from_date = datetime(from_date.year, from_date.month, from_date.day)

    # Build one row per day — (label, day start, day end) — covering both weeks.
    time_range = []
    current = from_date
    while current < to_date:
        time_range.append([current.strftime('%A %d %B %Y'), current, get_day_end(current)])
        current = offset_days(current, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)
    data = get_listening_activity()
    messages = create_messages(
        data=data,
        stats_range='week',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )
    logger.debug("Done!")
    return messages
def get_listens(cls):
    """Build a test dataframe of listens for two users at several day offsets from now."""
    cls.date = datetime.utcnow()
    # One listen just outside the recommendation window, plus three inside it.
    outside_window = stats.offset_days(cls.date, cls.recommendation_generation_window + 1)
    rows = [
        cls.get_listen_row(cls.date, 'vansika', 1),
        cls.get_listen_row(outside_window, 'vansika', 1),
        cls.get_listen_row(stats.offset_days(cls.date, 1), 'rob', 2),
        cls.get_listen_row(stats.offset_days(cls.date, 2), 'rob', 2),
    ]
    frames = [utils.create_dataframe(row, schema=None) for row in rows]
    test_mapped_df = frames[0].union(frames[1]).union(frames[2]).union(frames[3])
    return test_mapped_df
def test_get_listening_activity_month(self, mock_create_messages, mock_get_listening_activity, mock_get_listens, mock_get_latest_listen_ts):
    """The month stat registers the listens view, builds the day range, and emits messages."""
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df

    listening_activity_stats.get_listening_activity_month()

    to_date = datetime(2020, 6, 19)
    from_date = datetime(2020, 5, 1)
    # Expected one row per day between from_date (inclusive) and to_date (exclusive).
    expected = []
    day = from_date
    while day < to_date:
        expected.append([day.strftime('%d %B %Y'), day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    actual = run_query("SELECT * FROM time_range").rdd.map(list).collect()
    self.assertListEqual(actual, expected)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_df.createOrReplaceTempView.assert_called_with('listens')
    mock_create_messages.assert_called_with(
        data='listening_activity_table',
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )
def test_get_entity_week(self, mock_create_message, mock_filter_listens, mock_get_listens, mock_get_latest_listen_ts):
    """The weekly sitewide stat filters listens, builds the day range, and emits a message."""
    mock_df = MagicMock()
    mock_get_listens.return_value = mock_df
    mock_filtered_df = MagicMock()
    mock_filter_listens.return_value = mock_filtered_df

    entity_stats.get_entity_week('test', False)

    from_date = datetime(2020, 8, 3)
    to_date = datetime(2020, 8, 17)
    # Expected one (label, start ts, end ts) row per day in the two-week span.
    expected = []
    day = from_date
    while day < to_date:
        expected.append([day.strftime('%A %d %B %Y'), int(day.timestamp()),
                         int(get_day_end(day).timestamp())])
        day = offset_days(day, 1, shift_backwards=False)

    actual = run_query("SELECT * FROM time_range").rdd.map(list).collect()
    self.assertListEqual(actual, expected)

    mock_get_latest_listen_ts.assert_called_once()
    mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    mock_filter_listens.assert_called_with(mock_df, from_date, to_date)
    mock_filtered_df.createOrReplaceTempView.assert_called_with('sitewide_test_week')
    mock_create_message.assert_called_with(
        data='sitewide_test_week_data',
        entity='test',
        stats_range='week',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )
def test_get_dates_to_train_data(self):
    """from_date is the window start shifted back to the first day of its month."""
    train_model_window = 20
    to_date, from_date = create_dataframes.get_dates_to_train_data(train_model_window)
    expected = stats.replace_days(stats.offset_days(to_date, train_model_window), 1)
    self.assertEqual(from_date, expected)
def test_get_dates_to_generate_candidate_sets(self):
    """The window ends at the latest listen and starts window-days earlier at midnight."""
    mapped_df = self.get_listens()
    window = self.recommendation_generation_window

    from_date, to_date = candidate_sets.get_dates_to_generate_candidate_sets(mapped_df, window)

    self.assertEqual(to_date, self.date)
    expected_from = stats.offset_days(self.date, window).replace(hour=0, minute=0, second=0)
    self.assertEqual(from_date, expected_from)
def test_get_listens_for_training_model_window(self):
    """Listens in the window expose matchable columns and the expected row count."""
    to_date = get_latest_listen_ts()
    from_date = stats.offset_days(to_date, 2)
    # NOTE: removed a leftover debug print of (to_date, from_date) that polluted test output.
    test_df = dataframe_utils.get_listens_for_training_model_window(to_date, from_date, self.listens_path)
    self.assertIn('artist_name_matchable', test_df.columns)
    self.assertIn('track_name_matchable', test_df.columns)
    # Row count fixed by the test fixture data.
    self.assertEqual(test_df.count(), 11)
def test_get_dates_to_train_data(self):
    """The window end matches the fixture's latest listen; the start is month-aligned."""
    train_model_window = 12
    to_date, from_date = dataframe_utils.get_dates_to_train_data(train_model_window)
    expected_from = stats.replace_days(stats.offset_days(to_date, train_model_window), 1)
    # Latest listen timestamp comes from testdata/listens.json.
    self.assertEqual(to_date, datetime(2019, 1, 21, 0, 0))
    self.assertEqual(from_date, expected_from)
def upload_test_mapping_listens_subset_to_hdfs(cls):
    """Persist a four-day subset of the mapped listens to HDFS as parquet."""
    mapped_df = utils.read_files_from_HDFS(cls.mapped_listens_path)
    to_date = cls.date
    from_date = stats.offset_days(to_date, 4)
    subset_df = candidate_sets.get_listens_to_fetch_top_artists(mapped_df, from_date, to_date)
    utils.save_parquet(subset_df, cls.mapped_listens_subset_path)
def get_entity_week(
    entity: str, use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """Get the weekly sitewide top entity."""
    current_app.logger.debug("Calculating sitewide_{}_week...".format(entity))

    latest_listen = get_latest_listen_ts()
    to_date = get_last_monday(latest_listen)
    # Truncate the window end to midnight.
    to_date = datetime(to_date.year, to_date.month, to_date.day)
    from_date = offset_days(to_date, 14)

    # One (label, start ts, end ts) row per day across the two weeks.
    time_range = []
    day = from_date
    while day < to_date:
        time_range.append([
            day.strftime('%A %d %B %Y'),
            int(day.timestamp()),
            int(get_day_end(day).timestamp()),
        ])
        day = offset_days(day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(
        time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    table_name = 'sitewide_{}_week'.format(entity)
    filtered_df = filter_listens(listens_df, from_date, to_date)
    filtered_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "EEEE dd MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='week',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
def test_get_latest_listen_ts(self):
    """The newest listened_at timestamp in the stored listens is returned."""
    newest = datetime(2020, 5, 18)
    older = offset_days(newest, 7)
    df = utils.create_dataframe(Row(listened_at=newest), schema=None) \
        .union(utils.create_dataframe(Row(listened_at=older), schema=None))
    utils.save_parquet(df, '{}/2020/5.parquet'.format(self.path_))
    self.assertEqual(newest, stats_utils.get_latest_listen_ts())
def test_get_listens_for_training_model_window(self):
    """Metadata records the window bounds; MBID mapping columns are dropped."""
    metadata = {}
    to_date = get_latest_listen_ts()
    from_date = stats.offset_days(to_date, 2)
    test_df = create_dataframes.get_listens_for_training_model_window(
        to_date, from_date, metadata, self.listens_path)
    self.assertEqual(metadata['to_date'], to_date)
    self.assertEqual(metadata['from_date'], from_date)
    for column in ('artist_mbids', 'recording_mbid'):
        self.assertNotIn(column, test_df.columns)
def get_dates_to_train_data(train_model_window):
    """Get the date window of listens used to train the model.

    Args:
        train_model_window (int): number of days of data to train on.

    Returns:
        to_date (datetime): date up to which listens are fetched (latest listen).
        from_date (datetime): date from which listens are fetched, shifted back
            to the first day of its month.
    """
    to_date = get_latest_listen_ts()
    window_start = offset_days(to_date, train_model_window)
    # Snap the window start to the first of the month.
    from_date = replace_days(window_start, 1)
    return to_date, from_date
def test_filter_listens(self):
    """Only listens falling inside [from_date, to_date] survive filtering."""
    from_date = datetime(2020, 5, 1)
    to_date = datetime(2020, 5, 31)

    inside_early = offset_days(from_date, 5, shift_backwards=False)
    inside_late = offset_days(to_date, 5)
    timestamps = [
        offset_months(from_date, 1),                       # before the window
        offset_months(to_date, 1, shift_backwards=False),  # after the window
        inside_early,
        inside_late,
    ]
    df = utils.create_dataframe(Row(listened_at=timestamps[0]), None)
    for ts in timestamps[1:]:
        df = df.union(utils.create_dataframe(Row(listened_at=ts), None))

    rows = stats_utils.filter_listens(df, from_date, to_date).collect()
    self.assertEqual(rows[0]['listened_at'], inside_early)
    self.assertEqual(rows[1]['listened_at'], inside_late)
def get_daily_activity_week() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """Calculate the number of listens per user per hour on each day of the past week."""
    logger.debug("Calculating daily_activity_week")
    to_date = get_last_monday(get_latest_listen_ts())
    from_date = offset_days(to_date, 7)
    _get_listens(from_date, to_date)
    messages = create_messages(
        data=get_daily_activity(),
        stats_range='week',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )
    logger.debug("Done!")
    return messages
def setUpClass(cls):
    """Load the mapped-listens fixture and derive the dataframes shared by the tests."""
    super(CandidateSetsTestClass, cls).setUpClass()
    fixture_path = os.path.join(TEST_DATA_PATH, 'mapped_listens_candidate_sets.parquet')
    cls.mapped_listens_df = (
        listenbrainz_spark.session.read
        .parquet("file://" + fixture_path)
        .where("recording_mbid IS NOT NULL")
    )
    to_date = datetime(2019, 1, 21, tzinfo=timezone.utc)
    from_date = stats.offset_days(to_date, 4)
    cls.mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
        cls.mapped_listens_df, from_date, to_date)
    cls.recordings_df = create_dataframes.get_recordings_df(
        cls.mapped_listens_df, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)
    cls.users_df = create_dataframes.get_users_dataframe(
        cls.mapped_listens_df, {}, RECOMMENDATION_RECORDING_USERS_DATAFRAME)
def get_dates_to_generate_candidate_sets(mapped_listens_df, recommendation_generation_window):
    """Get the date window of listens used to generate candidate sets.

    Args:
        mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            Refer to create_dataframe.py for dataframe columns.
        recommendation_generation_window (int): number of days of listening
            history on which recommendations are generated.

    Returns:
        from_date (datetime): date from which listens are fetched, truncated
            to midnight.
        to_date (datetime): date up to which listens are fetched.
    """
    # Timestamp of the most recent listen stored in HDFS.
    to_date = mapped_listens_df \
        .select(func.max('listened_at').alias('listened_at')) \
        .collect()[0] \
        .listened_at
    window_start = stats.offset_days(to_date, recommendation_generation_window)
    from_date = window_start.replace(hour=0, minute=0, second=0)
    return from_date, to_date
def test_get_listening_activity_month(self, mock_create_messages, _, mock_get_listens):
    """The month stat builds the expected day range and emits messages with date bounds."""
    listening_activity_stats.get_listening_activity('month')

    from_date = datetime(2021, 6, 1)
    to_date = datetime(2021, 8, 1)
    # Expected one (label, day start, day end) row per day in the range.
    expected = []
    day = from_date
    while day < to_date:
        expected.append([day.strftime('%d %B %Y'), day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    actual = run_query("SELECT * FROM time_range").rdd.map(list).collect()
    self.assertListEqual(actual, expected)

    mock_get_listens.assert_called_with(from_date, to_date)
    mock_create_messages.assert_called_with(
        data='activity_table', stats_range='month',
        from_date=from_date, to_date=to_date)
def get_entity_week(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """Get the weekly top entity for all users."""
    logger.debug("Calculating {}_week...".format(entity))
    to_date = get_last_monday(get_latest_listen_ts())
    from_date = offset_days(to_date, 7)

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    filtered_df = filter_listens(listens_df, from_date, to_date)
    table_name = 'user_{}_week'.format(entity)
    filtered_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='week',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())
    logger.debug("Done!")
    return messages
def get_last_monday(date: datetime) -> datetime:
    """Return the most recent Monday on or before ``date``."""
    # weekday() is 0 for Monday, so this is the number of days to step back.
    days_since_monday = date.weekday()
    return offset_days(date, days_since_monday)