Ejemplo n.º 1
0
def get_listening_activity(stats_range: str):
    """ Calculate the number of listens of users for the given stats_range """
    logger.debug(f"Calculating listening_activity_{stats_range}")

    from_date, to_date, step, date_format = get_time_range(stats_range)

    # Build one row per period: [formatted label, period start, period end].
    # A period ends 1 microsecond before the next period's start.
    periods = []
    current = from_date
    while current < to_date:
        next_start = current + step
        periods.append([
            current.strftime(date_format),
            current,
            next_start + relativedelta(microseconds=-1),
        ])
        current = next_start

    listenbrainz_spark.session \
        .createDataFrame(periods, time_range_schema) \
        .createOrReplaceTempView("time_range")

    # Register the listens for this window so the SQL in
    # calculate_listening_activity can join against time_range.
    get_listens_from_new_dump(from_date, to_date).createOrReplaceTempView("listens")

    messages = create_messages(data=calculate_listening_activity(),
                               stats_range=stats_range,
                               from_date=from_date,
                               to_date=to_date)

    logger.debug("Done!")

    return messages
Ejemplo n.º 2
0
def get_daily_activity(stats_range: str) -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for an user for the specified time range """
    logger.debug(f"Calculating daily_activity_{stats_range}")

    from_date, to_date = get_dates_for_stats_range(stats_range)

    # Expose the listens for this window as a temp view consumed by
    # calculate_daily_activity's SQL.
    listens_df = get_listens_from_new_dump(from_date, to_date)
    listens_df.createOrReplaceTempView("listens")

    messages = create_messages(
        data=calculate_daily_activity(),
        stats_range=stats_range,
        from_date=from_date,
        to_date=to_date,
    )

    logger.debug("Done!")

    return messages
def calculate_listens_per_day(year):
    """ Compute per-day listen-count messages for the given calendar year
    (used by the year-in-music report). """
    from_date = datetime(year, 1, 1)
    to_date = datetime.combine(date(year, 12, 31), time.max)
    step = relativedelta(days=+1)
    date_format = "%d %B %Y"

    # One row per day: [formatted label, day start, day end]. Each day ends
    # 1 microsecond before the next day begins.
    rows = []
    day_start = from_date
    while day_start < to_date:
        day_end = day_start + step + relativedelta(microseconds=-1)
        rows.append([day_start.strftime(date_format), day_start, day_end])
        day_start += step

    listenbrainz_spark.session \
        .createDataFrame(rows, time_range_schema) \
        .createOrReplaceTempView("time_range")

    get_listens_from_new_dump(from_date, to_date).createOrReplaceTempView("listens")

    return create_messages(data=calculate_listening_activity(),
                           stats_range="year_in_music",
                           from_date=from_date,
                           to_date=to_date,
                           message_type="year_in_music_listens_per_day")
def get_listening_activity(stats_range: str) -> Iterator[Optional[Dict]]:
    """ Compute the number of listens for a time range compared to the previous range

    Given a time range, this computes a histogram of all listens for that range
    and the previous range of the same duration, so that they can be compared. The
    bin size of the histogram depends on the size of the range (e.g.
    year -> 12 months, month -> ~30 days, week -> ~7 days, see get_time_range for
    details). These values are used on the listening activity reports.
    """
    logger.debug(f"Calculating listening_activity_{stats_range}")

    # setup_time_range also registers the time_range temp view; we only need
    # the window bounds and the Spark-side date format here.
    from_date, to_date, _, _, spark_date_format = setup_time_range(stats_range)

    listens_df = get_listens_from_new_dump(from_date, to_date)
    listens_df.createOrReplaceTempView("listens")

    messages = create_messages(
        data=calculate_listening_activity(spark_date_format),
        stats_range=stats_range,
        from_date=from_date,
        to_date=to_date,
    )
    logger.debug("Done!")
    return messages
Ejemplo n.º 5
0
def calculate_top_entity_stats(year):
    """ Yield year-in-music top-entity stat messages (artists, recordings,
    releases) for the given calendar year. """
    from_date = datetime(year, 1, 1)
    to_date = datetime.combine(date(year, 12, 31), time.max)

    table = "listens_of_year"
    get_listens_from_new_dump(from_date, to_date).createOrReplaceTempView(table)

    for entity in ("artists", "recordings", "releases"):
        yield from calculate_entity_stats(from_date, to_date, table, entity,
                                          "this_year", "year_in_music_top_stats")
def get_listening_activity(stats_range: str):
    """ Compute the number of listens for a time range compared to the previous range

    Given a time range, this computes a histogram of a users' listens for that range
    and the previous range of the same duration, so that they can be compared. The
    bin size of the histogram depends on the size of the range (e.g.
    year -> 12 months, month -> ~30 days, week -> ~7 days, see get_time_range for
    details). These values are used on the listening activity reports.
    """
    logger.debug(f"Calculating listening_activity_{stats_range}")

    from_date, to_date, step, date_format = get_time_range(stats_range)

    # Each histogram bin spans [start, start + step - 1 microsecond].
    one_microsecond_back = relativedelta(microseconds=-1)
    bins = []
    start = from_date
    while start < to_date:
        bins.append([start.strftime(date_format),
                     start,
                     start + step + one_microsecond_back])
        start += step

    bins_df = listenbrainz_spark.session.createDataFrame(bins, time_range_schema)
    bins_df.createOrReplaceTempView("time_range")

    listens = get_listens_from_new_dump(from_date, to_date)
    listens.createOrReplaceTempView("listens")

    data = calculate_listening_activity()
    messages = create_messages(data=data,
                               stats_range=stats_range,
                               from_date=from_date,
                               to_date=to_date)

    logger.debug("Done!")

    return messages
Ejemplo n.º 7
0
def get_entity_stats(entity: str, stats_range: str, message_type="user_entity")\
        -> Iterator[Optional[Dict]]:
    """ Get the top entity for all users for specified stats_range """
    logger.debug(f"Calculating user_{entity}_{stats_range}...")

    from_date, to_date = get_dates_for_stats_range(stats_range)

    # Register this window's listens under a per-entity/per-range view name
    # so concurrent stats jobs don't clobber each other's tables.
    table = f"user_{entity}_{stats_range}"
    get_listens_from_new_dump(from_date, to_date).createOrReplaceTempView(table)

    messages = calculate_entity_stats(from_date, to_date, table, entity,
                                      stats_range, message_type)

    logger.debug("Done!")

    return messages
Ejemplo n.º 8
0
def get_entity_stats(entity: str, stats_range: str) -> Optional[List[SitewideEntityStatMessage]]:
    """ Returns top entity stats for given time period """
    logger.debug(f"Calculating sitewide_{entity}_{stats_range}...")

    from_date, to_date = get_dates_for_stats_range(stats_range)

    # Per-entity/per-range view name keeps sitewide jobs isolated.
    table_name = f"sitewide_{entity}_{stats_range}"
    get_listens_from_new_dump(from_date, to_date).createOrReplaceTempView(table_name)

    # Dispatch to the entity-specific aggregation, capped at the configured
    # listen-count limit for this range.
    handler = entity_handler_map[entity]
    data = handler(table_name, get_listen_count_limit(stats_range))

    messages = create_messages(
        data=data,
        entity=entity,
        stats_range=stats_range,
        from_date=from_date,
        to_date=to_date,
    )

    logger.debug("Done!")

    return messages
Ejemplo n.º 9
0
def get_entity_stats(
        entity: str,
        stats_range: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the top entity for all users for specified stats_range """
    logger.debug(f"Calculating user_{entity}_{stats_range}...")

    from_date, to_date = get_dates_for_stats_range(stats_range)

    # Per-entity/per-range view name keeps user-stats jobs isolated.
    table_name = f"user_{entity}_{stats_range}"
    listens_df = get_listens_from_new_dump(from_date, to_date)
    listens_df.createOrReplaceTempView(table_name)

    # Dispatch to the entity-specific aggregation.
    data = entity_handler_map[entity](table_name)

    messages = create_messages(
        data=data,
        entity=entity,
        stats_range=stats_range,
        from_date=from_date,
        to_date=to_date,
    )

    logger.debug("Done!")

    return messages
Ejemplo n.º 10
0
def setup_listens_for_year(year):
    """ Register all listens of the given calendar year as the
    "listens_of_year" temp view for downstream SQL. """
    start = datetime(year, 1, 1)
    end = datetime.combine(date(year, 12, 31), time.max)
    get_listens_from_new_dump(start, end).createOrReplaceTempView("listens_of_year")
Ejemplo n.º 11
0
def calculate_dataframes(from_date, to_date, job_type,
                         minimum_listens_threshold):
    """ Prepare and persist the dataframes needed by a model-training job.

    Filters the window's listens down to mapped (mbid-bearing) listens above
    the per-user threshold, saves users/recordings/playcounts dataframes and
    their metadata to HDFS, and returns the unfiltered listens dataframe.
    """
    # HDFS paths are keyed by job type; unknown types are rejected up front.
    job_paths = {
        "recommendation_recording": {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        },
        "similar_users": {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        },
    }
    try:
        paths = job_paths[job_type]
    except KeyError:
        raise SparkException(
            "Invalid job_type parameter received for creating dataframes: " +
            job_type)

    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    # "updated" should always be set to False in this script.
    metadata = {'updated': False}

    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    complete_listens_df = get_listens_from_new_dump(from_date, to_date)
    logger.info(
        f'Listen count from {from_date} to {to_date}: {complete_listens_df.count()}'
    )

    logger.info('Discarding listens without mbids...')
    partial_listens_df = complete_listens_df.where(
        col('recording_mbid').isNotNull())
    logger.info(f'Listen count after discarding: {partial_listens_df.count()}')

    logger.info('Thresholding listens...')
    threshold_listens_df = get_threshold_listens_df(partial_listens_df,
                                                    paths["mapped_listens"],
                                                    minimum_listens_threshold)
    logger.info(
        f'Listen count after thresholding: {threshold_listens_df.count()}')

    logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(threshold_listens_df, metadata,
                                   paths["users"])

    logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(threshold_listens_df, metadata,
                                      paths["recordings"])

    logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    listens_df = get_listens_df(threshold_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, metadata,
                       paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])
    return complete_listens_df
Ejemplo n.º 12
0
 def get_all_test_listens(cls):
     # Fetch every listen in the class's configured test window.
     begin, end = cls.begin_date, cls.end_date
     return get_listens_from_new_dump(begin, end)