Beispiel #1
0
def get_listening_activity_month() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user on each day of the past and current month.

        The window ends at midnight of the day of the latest listen and starts at the
        first of that month offset by one month via ``offset_months``. A ``time_range``
        temp view with one row per day of the window is registered before the listens
        are loaded.

        Returns:
            The messages produced by ``create_messages`` for the 'month' stats range.
    """
    current_app.logger.debug("Calculating listening_activity_month")

    # Truncate the latest listen timestamp to 00:00 of the same day
    latest = get_latest_listen_ts()
    to_date = datetime(latest.year, latest.month, latest.day)
    from_date = offset_months(replace_days(to_date, 1), 1)

    # One row per day in [from_date, to_date): display label, start and end of the day
    time_range = []
    cursor = from_date
    while cursor < to_date:
        label = cursor.strftime('%d %B %Y')
        time_range.append([label, cursor, get_day_end(cursor)])
        cursor = offset_days(cursor, 1, shift_backwards=False)

    listenbrainz_spark.session \
        .createDataFrame(time_range, time_range_schema) \
        .createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)

    messages = create_messages(
        data=get_listening_activity(),
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return messages
Beispiel #2
0
 def test_get_dates_to_train_data(self):
     """ from_date must equal to_date offset by the window with its day replaced by 1. """
     window = 20
     to_date, from_date = create_dataframes.get_dates_to_train_data(window)
     expected_from = stats.replace_days(stats.offset_days(to_date, window), 1)
     self.assertEqual(from_date, expected_from)
Beispiel #3
0
def get_listens(from_date, to_date, dest_path):
    """ Prepare dataframe of months falling between from_date and to_date (both inclusive).

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): Base path under which the '<year>/<month>.parquet'
                files are looked up.

        Returns:
            df (dataframe): Union of every monthly file that could be read, or None
                if no file was found. Columns can be depicted as:
                [
                    'artist_mbids', 'artist_msid', 'artist_name', 'listened_at', 'recording_mbid',
                    'recording_msid', 'release_mbid', 'release_msid', 'release_name', 'tags',
                    'track_name', 'user_name'
                ]

        Raises:
            ValueError: If to_date is earlier than from_date.
    """
    if to_date < from_date:
        # The message used to be formatted with an unbound `err`, which raised NameError
        # instead of the intended ValueError.
        raise ValueError('ValueError: Data generation window is negative i.e. from_date (date from which start fetching listens)'
                         ' is greater than to_date (date upto which fetch listens).\nAborting...')
    df = None
    while from_date <= to_date:
        try:
            month = read_files_from_HDFS('{}/{}/{}.parquet'.format(
                dest_path, from_date.year, from_date.month))
        except PathNotFoundException as err:
            # A missing month is not fatal: log it and carry on with the next one.
            current_app.logger.warning(
                '{}\nFetching file for next date...'.format(err))
        else:
            # Explicit None check: do not rely on the truthiness of a dataframe.
            df = month if df is None else df.union(month)
        # go to the next month of from_date
        from_date = stats.adjust_days(from_date,
                                      config.STEPS_TO_REACH_NEXT_MONTH,
                                      shift_backwards=False)
        # shift to the first of the month
        from_date = stats.replace_days(from_date, 1)
    return df
def get_listens(from_date, to_date, dest_path):
    """ Prepare dataframe of months falling between from_date and to_date (both inclusive).

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): HDFS path to fetch listens from.

        Returns:
            df: Dataframe of listens.

        Raises:
            ValueError: If to_date is earlier than from_date.
            HDFSException: If no monthly file could be read for the whole window.
    """
    if to_date < from_date:
        # `type(ValueError).__name__` is 'type'; the intended prefix is the exception name.
        raise ValueError(
            '{}: Data generation window is negative i.e. from_date (date from which start fetching listens)'
            ' is greater than to_date (date upto which fetch listens).'.format(
                ValueError.__name__))
    df = None
    while from_date <= to_date:
        try:
            month = read_files_from_HDFS('{}/{}/{}.parquet'.format(
                dest_path, from_date.year, from_date.month))
        except PathNotFoundException as err:
            # A missing month is not fatal: log it and carry on with the next one.
            current_app.logger.debug(
                '{}\nFetching file for next date...'.format(err))
        else:
            # Explicit None check: do not rely on the truthiness of a dataframe.
            df = month if df is None else df.union(month)
        # go to the next month of from_date
        from_date = stats.offset_months(date=from_date,
                                        months=1,
                                        shift_backwards=False)
        # shift to the first of the month
        from_date = stats.replace_days(from_date, 1)
    if df is None:
        current_app.logger.error('Listening history missing from HDFS')
        raise HDFSException("Listening history missing from HDFS")
    return df
def get_listens_for_training_model_window(metadata):
    """ Prepare dataframe of listens of X days to train. Here X is a config value.

        Args:
            metadata: Mapping that receives the 'to_date' and 'from_date' of the
                window (presumably a dict; confirm against the caller).

        Returns:
            training_df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]

        The process exits with status -1 if fetching the listens raises ValueError
        or FileNotFetchedException.
    """
    window_end = datetime.utcnow()
    # Step back by the configured window, then shift to the first of that month
    window_start = stats.replace_days(
        stats.adjust_days(window_end, config.TRAIN_MODEL_WINDOW), 1)

    metadata['to_date'] = window_end
    metadata['from_date'] = window_start

    try:
        return utils.get_listens(
            window_start,
            window_end,
            config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY,
        )
    except (ValueError, FileNotFetchedException) as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
Beispiel #6
0
def get_listens_for_rec_generation_window():
    """ Prepare dataframe of listens of X days to generate recommendations. Here X is a config value.

        Returns:
            df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]

        The process exits with status -1 if fetching the listens raises ValueError
        or FileNotFetchedException.
    """
    window_end = datetime.utcnow()
    # Step back by the configured window, then shift to the first of that month
    window_start = stats.replace_days(
        stats.adjust_days(window_end, config.RECOMMENDATION_GENERATION_WINDOW), 1)

    try:
        return utils.get_listens(
            window_start,
            window_end,
            config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY,
        )
    except (ValueError, FileNotFetchedException) as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
 def test_get_dates_to_train_data(self):
     """ Check both ends of the training window returned for a 12 day window. """
     window = 12
     to_date, from_date = dataframe_utils.get_dates_to_train_data(window)
     expected_from = stats.replace_days(stats.offset_days(to_date, window), 1)
     # refer to testdata/listens.json
     self.assertEqual(to_date, datetime(2019, 1, 21, 0, 0))
     self.assertEqual(from_date, expected_from)
Beispiel #8
0
def get_dates_to_train_data():
    """ Get window to fetch listens to train data.

        Returns:
            to_date (datetime): Date upto which fetch listens (current UTC time).
            from_date (datetime): Date from which start fetching listens, i.e. to_date
                adjusted by config.TRAIN_MODEL_WINDOW with the day replaced by 1.
    """
    window_end = datetime.utcnow()
    window_start = stats.adjust_days(window_end, config.TRAIN_MODEL_WINDOW)
    # shift to the first of the month
    return window_end, stats.replace_days(window_start, 1)
Beispiel #9
0
def get_dates_to_train_data(train_model_window):
    """ Get window to fetch listens to train data.

        Args:
            train_model_window (int): model to be trained on data of given number of days.

        Returns:
            to_date (datetime): Date upto which fetch listens (timestamp of the latest listen).
            from_date (datetime): Date from which start fetching listens, i.e. to_date
                offset by train_model_window days with the day replaced by 1.
    """
    window_end = get_latest_listen_ts()
    window_start = offset_days(window_end, train_model_window)
    # shift to the first of the month
    return window_end, replace_days(window_start, 1)
def get_daily_activity_month() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate the number of listens for a user per hour on each day of week of the current month.

        The window runs from 00:00 on the first day of the month of the latest listen
        up to the timestamp of the latest listen.

        Returns:
            The messages produced by ``create_messages`` for the 'month' stats range.
    """
    logger.debug("Calculating daily_activity_month")

    to_date = get_latest_listen_ts()
    month_start = replace_days(to_date, 1)
    # Drop the time component so the window starts at 00:00
    from_date = datetime(month_start.year, month_start.month, month_start.day)

    _get_listens(from_date, to_date)

    messages = create_messages(
        data=get_daily_activity(),
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    logger.debug("Done!")

    return messages
def get_entity_month(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the monthly sitewide top entity.

        Args:
            entity: Key into ``entity_handler_map``; also used in the temp view name.
            use_mapping: Forwarded unchanged to the entity handler.

        Returns:
            The message produced by ``create_message`` for the 'month' stats range.
    """
    current_app.logger.debug("Calculating sitewide_{}_month...".format(entity))

    # Truncate the latest listen timestamp to 00:00 of the same day
    latest = get_latest_listen_ts()
    to_date = datetime(latest.year, latest.month, latest.day)
    previous_month = offset_months(to_date, 1, shift_backwards=True)
    from_date = replace_days(previous_month, 1)

    # Generate one row per day of last and current month: label, start and end as epoch seconds
    time_range = []
    cursor = from_date
    while cursor < to_date:
        label = cursor.strftime('%d %B %Y')
        start_ts = int(cursor.timestamp())
        end_ts = int(get_day_end(cursor).timestamp())
        time_range.append([label, start_ts, end_ts])
        cursor = offset_days(cursor, 1, shift_backwards=False)

    listenbrainz_spark.session \
        .createDataFrame(time_range, schema=time_range_schema) \
        .createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_month'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "dd MMMM yyyy", use_mapping)

    message = create_message(
        data=data,
        entity=entity,
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return message
Beispiel #12
0
def get_entity_year(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the year top entity for all users.

        Args:
            entity: Key into ``entity_handler_map``; also used in the temp view name.

        Returns:
            The messages produced by ``create_messages`` for the 'year' stats range.
    """
    logger.debug("Calculating {}_year...".format(entity))

    to_date = get_latest_listen_ts()
    # Replace the month and then the day of the latest listen with 1
    first_month = replace_months(to_date, 1)
    from_date = replace_days(first_month, 1)

    table_name = 'user_{}_year'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(
        data=data,
        entity=entity,
        stats_range='year',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    logger.debug("Done!")

    return messages
Beispiel #13
0
 def test_replace_days(self):
     """ replace_days should swap only the day component of the given date. """
     original = datetime.datetime(2019, 5, 12)
     expected = datetime.datetime(2019, 5, 13)
     self.assertEqual(stats.replace_days(original, 13), expected)
 def test_get_dates_to_train_data(self):
     """ from_date must equal to_date adjusted by the configured window with its day replaced by 1. """
     to_date, from_date = create_dataframes.get_dates_to_train_data()
     expected_from = stats.replace_days(
         stats.adjust_days(to_date, config.TRAIN_MODEL_WINDOW), 1)
     self.assertEqual(from_date, expected_from)