Example #1
0
 def test_offset_days(self):
     """offset_days shifts backwards by default and forwards with shift_backwards=False."""
     base = datetime.datetime(2019, 5, 12)
     forward = stats.offset_days(base, 3, shift_backwards=False)
     self.assertEqual(forward, datetime.datetime(2019, 5, 15))
     backward = stats.offset_days(base, 3)
     self.assertEqual(backward, datetime.datetime(2019, 5, 9))
Example #2
0
def get_listening_activity_week() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate number of listens for an user on each day of the past and current week. """
    logger.debug("Calculating listening_activity_week")

    to_date = get_latest_listen_ts()
    # Start from the Monday of the previous week, truncated to midnight (00:00).
    from_date = offset_days(get_last_monday(to_date), 7)
    from_date = datetime(from_date.year, from_date.month, from_date.day)

    # Generate one [label, day start, day end] row per day of the last and current week.
    time_range = []
    current_day = from_date
    while current_day < to_date:
        time_range.append([current_day.strftime('%A %d %B %Y'), current_day, get_day_end(current_day)])
        current_day = offset_days(current_day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    # Register the listens for the window; get_listening_activity reads the temp views.
    _get_listens(from_date, to_date)

    data = get_listening_activity()
    messages = create_messages(data=data, stats_range='week', from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages
 def get_listens(cls):
     """Build a mapped-listens test dataframe: listens for two users at several day offsets."""
     cls.date = datetime.utcnow()
     # (timestamp, user_name, user_id) — one row per offset from the reference date.
     rows = [
         (cls.date, 'vansika', 1),
         (stats.offset_days(cls.date, cls.recommendation_generation_window + 1), 'vansika', 1),
         (stats.offset_days(cls.date, 1), 'rob', 2),
         (stats.offset_days(cls.date, 2), 'rob', 2),
     ]
     frames = [utils.create_dataframe(cls.get_listen_row(ts, user, user_id), schema=None)
               for ts, user, user_id in rows]
     test_mapped_df = frames[0]
     for frame in frames[1:]:
         test_mapped_df = test_mapped_df.union(frame)
     return test_mapped_df
    def test_get_listening_activity_month(self, mock_create_messages,
                                          mock_get_listening_activity,
                                          mock_get_listens,
                                          mock_get_latest_listen_ts):
        """Monthly listening activity should register the expected time range and messages."""
        mock_df = MagicMock()
        mock_get_listens.return_value = mock_df

        listening_activity_stats.get_listening_activity_month()

        to_date = datetime(2020, 6, 19)
        from_date = datetime(2020, 5, 1)

        # Expected rows: one [label, day start, day end] entry per day of the range.
        expected_range = []
        current = from_date
        while current < to_date:
            expected_range.append(
                [current.strftime('%d %B %Y'), current, get_day_end(current)])
            current = offset_days(current, 1, shift_backwards=False)

        actual_range = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_range, expected_range)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(from_date, to_date,
                                            LISTENBRAINZ_DATA_DIRECTORY)
        mock_df.createOrReplaceTempView.assert_called_with('listens')
        mock_create_messages.assert_called_with(
            data='listening_activity_table',
            stats_range='month',
            from_ts=from_date.timestamp(),
            to_ts=to_date.timestamp())
    def test_get_entity_week(self, mock_create_message, mock_filter_listens,
                             mock_get_listens, mock_get_latest_listen_ts):
        """Weekly sitewide entity stats should build the expected time range and message."""
        mock_df = MagicMock()
        mock_get_listens.return_value = mock_df
        mock_filtered_df = MagicMock()
        mock_filter_listens.return_value = mock_filtered_df

        entity_stats.get_entity_week('test', False)

        from_date = datetime(2020, 8, 3)
        to_date = datetime(2020, 8, 17)

        # Expected rows: [label, start ts, end ts] for each day of the two-week window.
        expected_range = []
        current = from_date
        while current < to_date:
            expected_range.append([current.strftime('%A %d %B %Y'),
                                   int(current.timestamp()),
                                   int(get_day_end(current).timestamp())])
            current = offset_days(current, 1, shift_backwards=False)

        actual_range = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_range, expected_range)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
        mock_filter_listens.assert_called_with(mock_df, from_date, to_date)
        mock_filtered_df.createOrReplaceTempView.assert_called_with('sitewide_test_week')
        mock_create_message.assert_called_with(data='sitewide_test_week_data', entity='test', stats_range='week',
                                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())
Example #6
0
 def test_get_dates_to_train_data(self):
     """Training window should start on the first of the month `train_model_window` days back."""
     train_model_window = 20
     to_date, from_date = create_dataframes.get_dates_to_train_data(
         train_model_window)
     expected_from = stats.replace_days(
         stats.offset_days(to_date, train_model_window), 1)
     self.assertEqual(from_date, expected_from)
 def test_get_dates_to_generate_candidate_sets(self):
     """Window ends at the latest listen and starts `recommendation_generation_window` days earlier at midnight."""
     mapped_df = self.get_listens()
     from_date, to_date = candidate_sets.get_dates_to_generate_candidate_sets(
         mapped_df, self.recommendation_generation_window)
     self.assertEqual(to_date, self.date)
     expected_from = stats.offset_days(
         self.date, self.recommendation_generation_window
     ).replace(hour=0, minute=0, second=0)
     self.assertEqual(from_date, expected_from)
 def test_get_listens_for_training_model_window(self):
     """Listens fetched for the training window keep the matchable name columns.

     Fix: removed a leftover debug `print(to_date, from_date)` statement.
     """
     to_date = get_latest_listen_ts()
     from_date = stats.offset_days(to_date, 2)
     test_df = dataframe_utils.get_listens_for_training_model_window(to_date, from_date, self.listens_path)
     # Normalised ("matchable") name columns must survive the window filtering.
     self.assertIn('artist_name_matchable', test_df.columns)
     self.assertIn('track_name_matchable', test_df.columns)
     # 11 listens in the test fixtures fall within the 2-day window.
     self.assertEqual(test_df.count(), 11)
 def test_get_dates_to_train_data(self):
     """Training window starts on the first of the month `train_model_window` days before to_date."""
     train_model_window = 12
     to_date, from_date = dataframe_utils.get_dates_to_train_data(train_model_window)
     expected_from = stats.replace_days(
         stats.offset_days(to_date, train_model_window), 1)
     # refer to testdata/listens.json
     self.assertEqual(to_date, datetime(2019, 1, 21, 0, 0))
     self.assertEqual(from_date, expected_from)
 def upload_test_mapping_listens_subset_to_hdfs(cls):
     """Filter the mapped listens to the top-artists window and save the subset as parquet."""
     mapped_df = utils.read_files_from_HDFS(cls.mapped_listens_path)
     to_date = cls.date
     # Window covers the four days leading up to the reference date.
     from_date = stats.offset_days(to_date, 4)
     subset_df = candidate_sets.get_listens_to_fetch_top_artists(mapped_df, from_date, to_date)
     utils.save_parquet(subset_df, cls.mapped_listens_subset_path)
def get_entity_week(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the weekly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_week...".format(entity))

    latest_listen = get_latest_listen_ts()

    # Range ends at the most recent Monday, truncated to midnight (00:00),
    # and starts two weeks before that.
    to_date = get_last_monday(latest_listen)
    to_date = datetime(to_date.year, to_date.month, to_date.day)
    from_date = offset_days(to_date, 14)

    # Generate one [label, start ts, end ts] row per day of the last and current week.
    time_range = []
    current_day = from_date
    while current_day < to_date:
        time_range.append([
            current_day.strftime('%A %d %B %Y'),
            int(current_day.timestamp()),
            int(get_day_end(current_day).timestamp())
        ])
        current_day = offset_days(current_day, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(
        time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    # Restrict listens to the window and expose them as a temp view for the handler.
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    filtered_df = filter_listens(listens_df, from_date, to_date)
    table_name = 'sitewide_{}_week'.format(entity)
    filtered_df.createOrReplaceTempView(table_name)

    # Dispatch to the per-entity aggregation handler.
    handler = entity_handler_map[entity]
    data = handler(table_name, "EEEE dd MMMM yyyy", use_mapping)
    message = create_message(data=data,
                             entity=entity,
                             stats_range='week',
                             from_ts=from_date.timestamp(),
                             to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return message
    def test_get_latest_listen_ts(self):
        """The newest listened_at timestamp among the saved listens should be returned."""
        newest = datetime(2020, 5, 18)
        older = offset_days(newest, 7)
        df = utils.create_dataframe(Row(listened_at=newest), schema=None)
        df = df.union(utils.create_dataframe(Row(listened_at=older), schema=None))
        utils.save_parquet(df, '{}/2020/5.parquet'.format(self.path_))

        self.assertEqual(newest, stats_utils.get_latest_listen_ts())
 def test_get_listens_for_training_model_window(self):
     """Metadata records the window bounds and the mbid columns are dropped."""
     metadata = {}
     to_date = get_latest_listen_ts()
     from_date = stats.offset_days(to_date, 2)
     test_df = create_dataframes.get_listens_for_training_model_window(
         to_date, from_date, metadata, self.listens_path)
     # The window bounds must be recorded in metadata for later pipeline stages.
     self.assertEqual(metadata['to_date'], to_date)
     self.assertEqual(metadata['from_date'], from_date)
     # MBID columns should not appear in the training dataframe.
     self.assertNotIn('artist_mbids', test_df.columns)
     self.assertNotIn('recording_mbid', test_df.columns)
Example #14
0
def get_dates_to_train_data(train_model_window):
    """ Get window to fetch listens to train data.

        Args:
            train_model_window (int): model to be trained on data of given number of days.

        Returns:
            to_date (datetime): Date upto which to fetch listens.
            from_date (datetime): Date from which to start fetching listens.
    """
    to_date = get_latest_listen_ts()
    # Go back `train_model_window` days, then snap to the first of that month.
    from_date = replace_days(offset_days(to_date, train_model_window), 1)
    return to_date, from_date
    def test_filter_listens(self):
        """Only listens with timestamps inside [from_date, to_date] survive filtering."""
        from_date = datetime(2020, 5, 1)
        to_date = datetime(2020, 5, 31)

        inside_low = offset_days(from_date, 5, shift_backwards=False)
        inside_high = offset_days(to_date, 5)
        timestamps = [
            offset_months(from_date, 1),                       # one month before the window
            offset_months(to_date, 1, shift_backwards=False),  # one month after the window
            inside_low,                                        # inside
            inside_high,                                       # inside
        ]
        df = utils.create_dataframe(Row(listened_at=timestamps[0]), None)
        for ts in timestamps[1:]:
            df = df.union(utils.create_dataframe(Row(listened_at=ts), None))

        rows = stats_utils.filter_listens(df, from_date, to_date).collect()

        # Only the two in-window listens remain.
        self.assertEqual(rows[0]['listened_at'], inside_low)
        self.assertEqual(rows[1]['listened_at'], inside_high)
def get_daily_activity_week() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for an user per hour on each day of the past week. """
    logger.debug("Calculating daily_activity_week")

    latest_listen = get_latest_listen_ts()
    # The week runs from the Monday before last up to the most recent Monday.
    to_date = get_last_monday(latest_listen)
    from_date = offset_days(to_date, 7)

    # Register the listens for the window; get_daily_activity reads the temp views.
    _get_listens(from_date, to_date)

    data = get_daily_activity()
    messages = create_messages(data=data, stats_range='week', from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages
    def setUpClass(cls):
        """Load the candidate-set fixtures and derive the dataframes shared by the tests."""
        super(CandidateSetsTestClass, cls).setUpClass()

        parquet_uri = "file://" + os.path.join(TEST_DATA_PATH, 'mapped_listens_candidate_sets.parquet')
        cls.mapped_listens_df = (listenbrainz_spark.session.read
                                 .parquet(parquet_uri)
                                 .where("recording_mbid IS NOT NULL"))

        # Top-artists window: the four days leading up to the fixture's latest listen.
        to_date = datetime(2019, 1, 21, tzinfo=timezone.utc)
        from_date = stats.offset_days(to_date, 4)
        cls.mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
            cls.mapped_listens_df, from_date, to_date)

        cls.recordings_df = create_dataframes.get_recordings_df(
            cls.mapped_listens_df, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)
        cls.users_df = create_dataframes.get_users_dataframe(
            cls.mapped_listens_df, {},
            RECOMMENDATION_RECORDING_USERS_DATAFRAME)
Example #18
0
def get_dates_to_generate_candidate_sets(mapped_listens_df,
                                         recommendation_generation_window):
    """ Get window to fetch listens to generate candidate sets.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping. Refer to create_dataframe.py
                                           for dataframe columns.
            recommendation_generation_window (int): recommendations to be generated on history of given number of days.

        Returns:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
    """
    # The window ends at the timestamp of the latest listen present in HDFS.
    to_date = mapped_listens_df.select(
        func.max('listened_at').alias('listened_at')).collect()[0].listened_at
    # Start `recommendation_generation_window` days earlier, at midnight.
    from_date = stats.offset_days(to_date, recommendation_generation_window)
    from_date = from_date.replace(hour=0, minute=0, second=0)
    return from_date, to_date
    def test_get_listening_activity_month(self, mock_create_messages, _,
                                          mock_get_listens):
        """Monthly listening activity should register the expected time range and messages."""
        listening_activity_stats.get_listening_activity('month')

        from_date = datetime(2021, 6, 1)
        to_date = datetime(2021, 8, 1)

        # Expected rows: one [label, day start, day end] entry per day of the range.
        expected_range = []
        current = from_date
        while current < to_date:
            expected_range.append(
                [current.strftime('%d %B %Y'), current, get_day_end(current)])
            current = offset_days(current, 1, shift_backwards=False)

        actual_range = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_range, expected_range)

        mock_get_listens.assert_called_with(from_date, to_date)
        mock_create_messages.assert_called_with(data='activity_table',
                                                stats_range='month',
                                                from_date=from_date,
                                                to_date=to_date)
Example #20
0
def get_entity_week(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the weekly top entity for all users """
    logger.debug("Calculating {}_week...".format(entity))

    latest_listen = get_latest_listen_ts()
    # The week ends on the most recent Monday and starts seven days earlier.
    to_date = get_last_monday(latest_listen)
    from_date = offset_days(to_date, 7)

    # Restrict listens to the window and expose them as a temp view for the handler.
    table_name = 'user_{}_week'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    filtered_df = filter_listens(listens_df, from_date, to_date)
    filtered_df.createOrReplaceTempView(table_name)

    # Dispatch to the per-entity aggregation handler.
    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='week',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages
Example #21
0
def get_last_monday(date: datetime) -> datetime:
    """ Get date for Monday before 'date' """
    # datetime.weekday() is 0 for Monday, so shifting back that many days lands on Monday.
    days_since_monday = date.weekday()
    return offset_days(date, days_since_monday)