def test_get_entity_all_time(self, mock_create_message, mock_get_listens,
                                 mock_get_latest_listen_ts):
        """ Check that get_entity_all_time registers the expected yearly
            time ranges and calls its collaborators with the right arguments.

            The mock_* parameters are injected by @patch decorators that are
            outside this view — presumably mock_get_latest_listen_ts is what
            pins to_date to 2020-08-21; confirm against the decorators.
        """
        mock_df = MagicMock()
        mock_get_listens.return_value = mock_df

        entity_stats.get_entity_all_time('test', False)
        # Expected window: LastFM's founding year through the (mocked)
        # latest listen timestamp.
        from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)
        to_date = datetime(2020, 8, 21)
        month = from_date  # NOTE(review): unused leftover — candidate for removal

        # One [label, year_start_ts, year_end_ts] triple per calendar year.
        time_range = [[
            str(year),
            int(datetime(year, 1, 1).timestamp()),
            int(get_year_end(year).timestamp())
        ] for year in range(from_date.year, to_date.year + 1)]
        # The function under test should have registered this "time_range"
        # temp view in the active Spark session.
        time_range_df = run_query("SELECT * FROM time_range")
        time_range_result = time_range_df.rdd.map(list).collect()
        self.assertListEqual(time_range_result, time_range)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(from_date, to_date,
                                            LISTENBRAINZ_DATA_DIRECTORY)
        mock_df.createOrReplaceTempView.assert_called_with(
            'sitewide_test_all_time')
        mock_create_message.assert_called_with(
            data='sitewide_test_all_time_data',
            entity='test',
            stats_range='all_time',
            from_ts=from_date.timestamp(),
            to_ts=to_date.timestamp())
    def test_get_listening_activity_year(self, mock_create_messages,
                                         mock_get_listening_activity,
                                         mock_get_listens,
                                         mock_get_latest_listen_ts):
        """ Verify the yearly listening-activity job registers one time range
            per month of the window and wires its collaborators correctly.
            (Mock arguments come from @patch decorators outside this view.)
        """
        listens_mock = MagicMock()
        mock_get_listens.return_value = listens_mock

        listening_activity_stats.get_listening_activity_year()
        to_date = datetime(2020, 6, 19)
        from_date = datetime(2019, 1, 1)

        # Build the expected [label, month_start, month_end] rows.
        expected_ranges = []
        cursor = from_date
        while cursor < to_date:
            expected_ranges.append(
                [cursor.strftime('%B %Y'), cursor, get_month_end(cursor)])
            cursor = offset_months(cursor, 1, shift_backwards=False)

        actual_ranges = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_ranges, expected_ranges)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(from_date, to_date,
                                            LISTENBRAINZ_DATA_DIRECTORY)
        listens_mock.createOrReplaceTempView.assert_called_with('listens')
        mock_create_messages.assert_called_with(
            data='listening_activity_table',
            stats_range='year',
            from_ts=from_date.timestamp(),
            to_ts=to_date.timestamp())
Example #3
0
def get_top_artists(user_name):
    """ Prepare dataframe of top y (limit) artists listened to by the user where y
        is a config value.

        Args:
            user_name (str): User name of the user.

        Returns:
            top_artists_df (dataframe): Columns can be depicted as:
                [
                    'user_name', 'artist_name', 'artist_msid', 'count'
                ]
    """
    # NOTE(review): user_name is interpolated directly into the SQL text;
    # Spark SQL offers no bind parameters here, so a name containing a
    # double quote breaks the query — callers must pass trusted names only.
    #
    # Filter moved from HAVING to WHERE: user_name is a grouping column, so
    # the result is identical, but rows are discarded before aggregation
    # instead of after.
    top_artists_df = run_query("""
            SELECT user_name
                 , artist_name
                 , artist_msid
                 , count(artist_msid) as count
              FROM listens_df
             WHERE user_name = "%s"
          GROUP BY user_name, artist_name, artist_msid
          ORDER BY count DESC
             LIMIT %s
    """ % (user_name, config.TOP_ARTISTS_LIMIT))
    return top_artists_df
    def test_get_listening_activity_week(self, mock_create_messages,
                                         mock_get_listening_activity,
                                         mock_get_listens,
                                         mock_get_latest_listen_ts):
        """ Verify the weekly listening-activity job registers one time range
            per day of the window and wires its collaborators correctly.
            (Mock arguments come from @patch decorators outside this view.)
        """
        listens_mock = MagicMock()
        mock_get_listens.return_value = listens_mock

        listening_activity_stats.get_listening_activity_week()
        to_date = datetime(2020, 9, 10)
        from_date = datetime(2020, 8, 31)

        # Build the expected [label, day_start, day_end] rows.
        expected_ranges = []
        cursor = from_date
        while cursor < to_date:
            expected_ranges.append(
                [cursor.strftime('%A %d %B %Y'), cursor, get_day_end(cursor)])
            cursor = offset_days(cursor, 1, shift_backwards=False)

        actual_ranges = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_ranges, expected_ranges)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(from_date, to_date,
                                            LISTENBRAINZ_DATA_DIRECTORY)
        listens_mock.createOrReplaceTempView.assert_called_with('listens')
        mock_create_messages.assert_called_with(
            data='listening_activity_table',
            stats_range='week',
            from_ts=from_date.timestamp(),
            to_ts=to_date.timestamp())
def get_day_of_week(year):
    """ Yield, for every user, the weekday they listened to music on the most
        during the given year (ties broken arbitrarily by row_number).
    """
    setup_listens_for_year(year)
    # Count listens per (user, weekday), keep the top weekday per user and
    # collapse the result into a single user_id -> weekday JSON map.
    rows = run_query("""
        WITH listen_weekday AS (
              SELECT user_id
                   , date_format(listened_at, 'EEEE') AS weekday
                   , count(*) AS listen_count
                FROM listens_of_year
            GROUP BY user_id
                   , weekday
        ), top_listen_weekday AS (
              SELECT user_id
                   , weekday
                   , listen_count
                   , row_number() OVER(PARTITION BY user_id ORDER BY listen_count DESC) AS row_number
                FROM listen_weekday
        )
        SELECT to_json(
                    map_from_entries(
                        collect_list(
                            struct(user_id, weekday)
                        )
                    )
                ) AS all_users_weekday
          FROM top_listen_weekday
         WHERE row_number = 1
    """).collect()
    yield {
        "type": "day_of_week",
        "data": rows[0]["all_users_weekday"],
    }
def get_playcounts_data(listens_df, users_df, recordings_df):
    """ Prepare playcounts dataframe by joining listens_df, users_df and
        recordings_df to select distinct tracks that a user has listened to
        for all the users along with listen count.

        Args:
            listens_df: Listens dataframe.
            users_df: Users dataframe.
            recordings_df: Recordings dataframe.

        Returns:
            playcounts_df: playcounts dataframe with columns as:
                ['user_id', 'recording_id', 'count']
    """
    # Register each input under the view name the query below expects.
    for dataframe, view_name in ((listens_df, 'listen'),
                                 (users_df, 'user'),
                                 (recordings_df, 'recording')):
        dataframe.createOrReplaceTempView(view_name)

    return run_query("""
        SELECT user_id,
               recording_id,
               count(recording_id) as count
          FROM listen
    INNER JOIN user
            ON listen.user_name = user.user_name
    INNER JOIN recording
            ON recording.recording_msid = listen.recording_msid
      GROUP BY user_id, recording_id
      ORDER BY user_id
    """)
Example #7
0
def _create_mapped_dataframe():
    """ Use the MSID-MBID mapping to improve data accuracy and quality.

        Returns:
            mapped_df (dataframe): A DataFrame with mapped data
    """
    # Load only the mapping columns that are joined on or copied over,
    # and expose them to the SQL below as the "mapping" view.
    msid_mbid_mapping = read_files_from_HDFS(MBID_MSID_MAPPING).select(
        'mb_artist_credit_name', 'mb_artist_credit_mbids', 'msb_artist_msid')
    msid_mbid_mapping.createOrReplaceTempView('mapping')

    # Prefer MusicBrainz names/mbids when the mapping matched; keep the
    # MSID only for listens that have neither mapped nor embedded mbids.
    return run_query("""
                SELECT CASE
                         WHEN isnull(mb_artist_credit_name) THEN artist_name
                         ELSE mb_artist_credit_name
                       END as artist_name
                     , CASE
                         WHEN isnull(mb_artist_credit_mbids) THEN artist_mbids
                         ELSE mb_artist_credit_mbids
                       END as artist_mbids
                     , CASE
                         WHEN isnull(mb_artist_credit_mbids) AND cardinality(artist_mbids) == 0 THEN nullif(artist_msid, "")
                         ELSE NULL
                       END as artist_msid
                     , listened_at
                  FROM listens
             LEFT JOIN mapping
                    ON listens.artist_msid == mapping.msb_artist_msid
                    """)
    def test_get_daily_activity(self):
        """ Daily-activity stats should match the values computed from the
            fixture listens, and the registered time_range view must cover
            every (weekday, hour) combination.
        """
        fixture = self._create_listens_table()
        expected = self._calculate_expected(fixture)

        row_dicts = (entry.asDict(recursive=True)
                     for entry in daily_activity_stats.get_daily_activity())
        received = {d['user_name']: d['daily_activity'] for d in row_dicts}

        # Full cartesian product of weekday names and hours 0-23.
        all_slots = itertools.product(
            [calendar.day_name[day] for day in range(0, 7)],
            [hour for hour in range(0, 24)])
        expected_slot_rows = listenbrainz_spark.session.createDataFrame(
            all_slots, schema=["day", "hour"]).toLocalIterator()
        time_range_expected = [
            entry.asDict(recursive=True) for entry in expected_slot_rows
        ]

        received_slot_rows = run_query(
            "SELECT * FROM time_range").toLocalIterator()
        time_range_received = [
            entry.asDict(recursive=True) for entry in received_slot_rows
        ]

        self.assertListEqual(time_range_expected, time_range_received)
        self.assertDictEqual(expected, received)
Example #9
0
def get_artists(table: str, limit: int = SITEWIDE_STATS_ENTITY_LIMIT):
    """ Get artist information (artist_name, artist_msid etc) for every time range specified
        the "time_range" table ordered by listen count

        Args:
            table: Name of the temporary table.
            limit: number of top artists to retain
        Returns:
            iterator (iter): An iterator over result
    """
    # We sort twice: the ORDER BY in the CTE eliminates all but the top
    # LIMIT results, and collect_list's docs mention the order of collected
    # results is not guaranteed, so sort again with sort_array. This matches
    # the sibling sitewide get_releases/get_recordings queries (previously
    # the collected list's order was undefined). listen_count is the first
    # struct field so sort_array orders by it.
    result = run_query(f"""
        WITH intermediate_table as (
            SELECT artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
          GROUP BY artist_name
                 , artist_credit_mbids
          ORDER BY listen_count DESC
             LIMIT {limit}
        )
        SELECT sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                   , false
               ) AS stats
          FROM intermediate_table
    """)

    return result.toLocalIterator()
    def test_get_entity_week(self, mock_create_message, mock_filter_listens,
                             mock_get_listens, mock_get_latest_listen_ts):
        """ Verify weekly sitewide entity stats register per-day time ranges
            and pass the expected arguments to their collaborators.
            (Mock arguments come from @patch decorators outside this view.)
        """
        raw_df = MagicMock()
        mock_get_listens.return_value = raw_df
        filtered_df = MagicMock()
        mock_filter_listens.return_value = filtered_df

        entity_stats.get_entity_week('test', False)
        from_date = datetime(2020, 8, 3)
        to_date = datetime(2020, 8, 17)

        # Build the expected [label, day_start_ts, day_end_ts] rows.
        expected_ranges = []
        cursor = from_date
        while cursor < to_date:
            expected_ranges.append([
                cursor.strftime('%A %d %B %Y'),
                int(cursor.timestamp()),
                int(get_day_end(cursor).timestamp())
            ])
            cursor = offset_days(cursor, 1, shift_backwards=False)
        actual_ranges = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_ranges, expected_ranges)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
        mock_filter_listens.assert_called_with(raw_df, from_date, to_date)
        filtered_df.createOrReplaceTempView.assert_called_with('sitewide_test_week')
        mock_create_message.assert_called_with(data='sitewide_test_week_data', entity='test', stats_range='week',
                                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())
def get_listen_count(year):
    """ Yield the yearly listen counts for the given year. """
    setup_listens_for_year(year)
    rows = run_query(_get_yearly_listen_counts()).collect()
    yield {"type": "year_in_music_listen_count", "data": rows[0]["yearly_listen_counts"]}
Example #12
0
def prepare_recording_data(table):
    """ Prepare recordings dataframe to select distinct recordings/tracks
        listened to and assign each recording a unique integer id.

        Args:
            table (str): Registered dataframe to run SQL queries.

        Returns:
            recordings_df (dataframe): Columns can be depicted as:
                [
                    'track_name', 'recording_msid', 'artist_name', 'artist_msid',
                    'release_name', 'release_msid', 'recording_id'
                ]
    """
    # BUGFIX: the window previously ordered by the string literal
    # 'recording_msid' (a constant), so recording_id assignment was
    # effectively arbitrary. Order by the column itself so ids are assigned
    # deterministically by recording_msid.
    recordings_df = run_query("""
            SELECT track_name
                 , recording_msid
                 , artist_name
                 , artist_msid
                 , release_name
                 , release_msid
                 , row_number() OVER (ORDER BY recording_msid) AS recording_id
              FROM (SELECT DISTINCT recording_msid, track_name, artist_name, artist_msid,
                    release_name, release_msid FROM %s)
        """ % (table))
    return recordings_df
 def test_run_query(self):
     # Sanity check: run_query over a registered temp view returns every
     # row of the underlying dataframe.
     df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
     utils.register_dataframe(df, "table")
     new_df = stats.run_query("""
      SELECT *
       FROM table
       """)
     self.assertEqual(new_df.count(), df.count())
Example #14
0
def get_most_prominent_color(year):
    """ Yield each user's most prominent release color for the given year. """
    setup_listens_for_year(year)
    # The color query below reads from the "release_color" view.
    _get_release_colors().createOrReplaceTempView("release_color")
    rows = run_query(_get_most_prominent_color()).collect()
    yield {"type": "most_prominent_color", "data": rows[0]["all_users_colors"]}
Example #15
0
def get_most_listened_year(year):
    """ Yield per-user listen counts bucketed by release date for the given year. """
    setup_listens_for_year(year)
    setup_all_releases()
    rows = run_query(_get_releases_with_date()).collect()
    yield {"type": "most_listened_year", "data": rows[0]["all_user_yearly_counts"]}
def get_releases(table):
    """
    Compute per-user release statistics ordered by listen count (number of
    times a user has listened to tracks which belong to a particular release).

    Args:
        table: name of the temporary table

    Returns:
        iterator (iter): an iterator over result
                {
                    'user1' : [{
                        'release_name': str
                        'release_msid': str,
                        'release_mbid': str,
                        'artist_name': str,
                        'artist_msid': str,
                        'artist_mbids': list(str),
                        'listen_count': int
                    }],
                    'user2' : [{...}],
                }
    """
    # Group case-insensitively on names; sort_array orders each user's list
    # by listen_count descending (listen_count is the first struct field).
    releases_df = run_query(f"""
        WITH intermediate_table as (
            SELECT user_id
                , first(release_name) AS any_release_name
                , release_mbid
                , first(artist_name) AS any_artist_name
                , artist_credit_mbids
                , count(*) as listen_count
              FROM {table}
             WHERE release_name != ''
          GROUP BY user_id
                , lower(release_name)
                , release_mbid
                , lower(artist_name)
                , artist_credit_mbids
        )
        SELECT user_id
             , sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_release_name AS release_name
                          , release_mbid
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                   , false
                ) as releases
          FROM intermediate_table
      GROUP BY user_id
        """)

    return releases_df.toLocalIterator()
def get_releases(table: str, limit: int = SITEWIDE_STATS_ENTITY_LIMIT):
    """
    Compute sitewide release statistics ordered by listen count (number of
    times listened to tracks which belong to a particular release).

    Args:
        table: name of the temporary table
        limit: number of top releases to retain
    Returns:
        iterator: an iterator over result, contains only 1 row
                {
                    [
                        {
                            'release_name': str
                            'release_msid': str,
                            'release_mbid': str,
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        },
                        ...
                    ],
                }
    """
    # ORDER BY + LIMIT in the CTE keeps only the top releases; sort_array
    # re-sorts the collected list (collect_list order is not guaranteed).
    stats_df = run_query(f"""
        WITH intermediate_table as (
            SELECT first(release_name) AS any_release_name
                 , release_mbid
                 , first(artist_name) AS any_artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
             WHERE release_name != ''
          GROUP BY lower(release_name)
                 , release_mbid
                 , lower(artist_name)
                 , artist_credit_mbids
          ORDER BY listen_count DESC
             LIMIT {limit}
        )
        SELECT sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_release_name AS release_name
                          , release_mbid
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                   , false
                ) as stats
          FROM intermediate_table
        """)

    return stats_df.toLocalIterator()
def calculate_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the "time_range" table.
    The time ranges are as follows:
        1) week - each day with weekday name of the past 2 weeks.
        2) month - each day the past 2 months.
        3) year - each month of the past 2 years.
        4) all_time - each year starting from LAST_FM_FOUNDING_YEAR (2002)
    """
    # Listens per user per time range; ranges with zero listens are absent here.
    nonzero_counts = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    nonzero_counts.createOrReplaceTempView("result_without_zero_days")

    # Cross-join every user with every time range and fill missing
    # combinations with a zero listen count.
    full_counts = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Fold each user's ranges into one sorted list of structs.
    with_struct = full_counts.withColumn(
        "listening_activity",
        struct("from_ts", "to_ts", "listen_count", "time_range"))
    per_user = with_struct.groupBy("user_name").agg(
        sort_array(collect_list("listening_activity")).alias("listening_activity"))

    return per_user.toLocalIterator()
def calculate_daily_activity():
    """ Calculate number of listens for each user in each hour of each weekday. """

    # Build a dataframe holding every (weekday name, hour) combination.
    weekday_names = [calendar.day_name[day] for day in range(0, 7)]
    all_slots = itertools.product(weekday_names, [hour for hour in range(0, 24)])
    slots_df = listenbrainz_spark.session.createDataFrame(
        all_slots, schema=["day", "hour"])
    slots_df.createOrReplaceTempView("time_range")

    # Truncate listened_at to day and hour to improve matching speed.
    formatted_listens = run_query("""
                            SELECT user_id
                                 , date_format(listened_at, 'EEEE') as day
                                 , date_format(listened_at, 'H') as hour
                              FROM listens
                              """)
    formatted_listens.createOrReplaceTempView("listens")

    # Listens per user per (day, hour); slots with zero listens are absent.
    counts = run_query("""
                SELECT listens.user_id
                     , time_range.day
                     , time_range.hour
                     , count(*) as listen_count
                  FROM listens
                  JOIN time_range
                    ON listens.day == time_range.day
                   AND listens.hour == time_range.hour
              GROUP BY listens.user_id
                     , time_range.day
                     , time_range.hour
                  """)

    # Fold each user's slots into one sorted list of structs.
    with_struct = counts.withColumn(
        "daily_activity", struct("hour", "day", "listen_count"))
    per_user = with_struct.groupBy("user_id").agg(
        sort_array(collect_list("daily_activity")).alias("daily_activity"))

    return per_user.toLocalIterator()
def get_releases(table):
    """
    Compute per-user release statistics ordered by listen count (number of
    times a user has listened to tracks which belong to a particular release).

    Args:
        table: name of the temporary table

    Returns:
        artists: A dict of dicts which can be depicted as:
                {
                    'user1' : [{
                        'release_name': str
                        'release_msid': str,
                        'release_mbid': str,
                        'artist_name': str,
                        'artist_msid': str,
                        'artist_mbids': str,
                        'listen_count': int
                    }],
                    'user2' : [{...}],
                }
    """
    started_at = time.time()
    result_df = run_query("""
            SELECT user_name
                 , release_name
                 , release_msid
                 , release_mbid
                 , artist_name
                 , artist_msid
                 , artist_mbids
                 , count(release_msid) as cnt
              FROM %s
          GROUP BY user_name, release_name, release_msid, release_mbid, artist_name, artist_msid, artist_mbids
          ORDER BY cnt DESC
        """ % (table))
    # NOTE: collect() pulls every row to the driver; fine for modest result
    # sizes, which this function presumably assumes.
    releases = defaultdict(list)
    for row in result_df.collect():
        releases[row.user_name].append({
            'release_name': row.release_name,
            'release_msid': row.release_msid,
            'release_mbid': row.release_mbid,
            'artist_name': row.artist_name,
            'artist_msid': row.artist_msid,
            'artist_mbids': row.artist_mbids,
            'listen_count': row.cnt,
        })
    print("Query to calculate release stats processed in %.2f s" %
          (time.time() - started_at))
    return releases
Example #21
0
def get_recordings(table: str, limit: int = SITEWIDE_STATS_ENTITY_LIMIT):
    """
    Get recordings information (artist_name, artist_msid etc) for every
    time range specified ordered by listen count.

    Args:
        table: Name of the temporary table.
        limit: number of top recordings to retain
    Returns:
        iterator (iter): An iterator over result
    """
    # Two sorts on purpose: the CTE's ORDER BY trims everything but the top
    # LIMIT rows, and since collect_list does not guarantee ordering the
    # collected list is re-sorted with sort_array.
    recordings_df = run_query(f"""
        WITH intermediate_table as (
            SELECT first(recording_name) AS any_recording_name
                 , recording_mbid
                 , first(artist_name) AS any_artist_name
                 , artist_credit_mbids
                 , nullif(first(release_name), '') as any_release_name
                 , release_mbid
                 , count(*) as listen_count
              FROM {table}
          GROUP BY lower(recording_name)
                 , recording_mbid
                 , lower(artist_name)
                 , artist_credit_mbids
                 , lower(release_name)
                 , release_mbid
          ORDER BY listen_count DESC
             LIMIT {limit}
        )
        SELECT sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_recording_name AS track_name
                          , recording_mbid
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                          , any_release_name AS release_name
                          , release_mbid
                        )
                    )
                   , false
                ) as stats
          FROM intermediate_table
    """)

    return recordings_df.toLocalIterator()
Example #22
0
def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Listens per user per time range; ranges with zero listens are absent here.
    nonzero_counts = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    nonzero_counts.createOrReplaceTempView('result_without_zero_days')

    # Cross-join every user with every time range and fill missing
    # combinations with a zero listen count.
    full_counts = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Fold each user's ranges into one sorted list of structs.
    with_struct = full_counts.withColumn(
        "listening_activity",
        struct("from_ts", "to_ts", "listen_count", "time_range"))
    per_user = with_struct.groupBy("user_name").agg(
        sort_array(collect_list("listening_activity")).alias("listening_activity"))

    return per_user.toLocalIterator()
Example #23
0
def get_top_artists_with_collab():
    """ Prepare dataframe consisting of top artists with non zero collaborations.

        Returns:
            top_artists_with_collab_df (dataframe): Column can be depicted as:
                [
                    'artist_name'
                ]
    """
    # Every distinct artist present in the similar_artist view has at least
    # one collaboration by construction.
    return run_query("""
        SELECT DISTINCT artist_name
          FROM similar_artist
    """)
    def test_get_daily_activity(self):
        """ all_time daily activity must register a full weekday x hour
            time_range view and match the stored expected JSON fixture.
        """
        received = daily_activity.get_daily_activity('all_time')

        expected_slots = itertools.product(calendar.day_name, range(0, 24))
        actual_slots = run_query("SELECT * FROM time_range").toLocalIterator()
        self.assertListEqual(list(expected_slots), list(actual_slots))

        with open(self.path_to_data_file('user_daily_activity.json')) as f:
            expected = json.load(f)

        self.assertListEqual(expected, list(received))
def get_artists(table: str, user_listen_count_limit, top_artists_limit: int = SITEWIDE_STATS_ENTITY_LIMIT):
    """ Get artist information (artist_name, artist_msid etc) for every time range specified
        the "time_range" table ordered by listen count

        Args:
            table: name of the temporary table
            user_listen_count_limit: per user per entity listen count above which it should be capped
            top_artists_limit: number of top artists to retain
        Returns:
            iterator (iter): An iterator over result
    """
    # Two sorts on purpose: the ORDER BY + LIMIT trims everything but the
    # top artists, and since collect_list does not guarantee ordering the
    # collected list is re-sorted with sort_array.
    artists_df = run_query(f"""
        WITH user_counts as (
            SELECT user_id
                 , first(artist_name) AS artist_name
                 , artist_credit_mbids
                 , LEAST(count(*), {user_listen_count_limit}) as listen_count
              FROM {table}
          GROUP BY user_id
                 , lower(artist_name)
                 , artist_credit_mbids
        ), intermediate_table AS (
            SELECT first(artist_name) AS artist_name
                 , artist_credit_mbids
                 , SUM(listen_count) as total_listen_count
              FROM user_counts
          GROUP BY lower(artist_name)
                 , artist_credit_mbids
          ORDER BY total_listen_count DESC
             LIMIT {top_artists_limit}
        )
        SELECT sort_array(
                    collect_list(
                        struct(
                            total_listen_count AS listen_count
                          , artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                    , false
               ) AS stats
          FROM intermediate_table
    """)

    return artists_df.toLocalIterator()
def get_user_id(user_name):
    """ Look up a user's id by their user name.

        Args:
            user_name: Name of the user.

        Returns:
            user_id: User id of the user.
    """
    # NOTE(review): user_name is interpolated straight into the SQL text —
    # a name containing a single quote breaks the query; verify inputs are
    # trusted upstream.
    first_row = run_query("""
        SELECT user_id
          FROM user
         WHERE user_name = '%s'
    """ % user_name).first()
    return first_row['user_id']
Example #27
0
def get_latest_listen_ts():
    """ Get the timestamp of the latest listen present in the spark cluster. """
    # Walk backwards one month at a time until a listens file exists in HDFS.
    probe_date = datetime.now()
    while True:
        try:
            df = utils.get_listens(probe_date, probe_date, LISTENBRAINZ_DATA_DIRECTORY)
            break
        except HDFSException:
            probe_date = offset_months(probe_date, 1)

    df.createOrReplaceTempView('latest_listen_ts')
    rows = run_query(
        "SELECT MAX(listened_at) as max_timestamp FROM latest_listen_ts").collect()
    return rows[0]['max_timestamp']
Example #28
0
def get_artists(table: str) -> Iterator[UserArtistRecord]:
    """ Get per-user artist listen statistics ordered by listen count.

        Artists are grouped case-insensitively on name (together with their
        credit mbids); the first seen spelling of the name is kept.

        Args:
            table: name of the temporary table holding the listens.

        Returns:
            iterator (iter): an iterator over result rows, one per user,
                each carrying a sorted ``artists`` array of
                (listen_count, artist_name, artist_mbids) structs:
                    {
                        user1: [
                            {
                                'artist_name': str,
                                'artist_mbids': list(str),
                                'listen_count': int
                            }
                        ],
                        user2: [{...}],
                    }
    """

    top_artists_query = f"""
        WITH intermediate_table as (
            SELECT user_name
                 , first(artist_name) AS any_artist_name
                 , artist_credit_mbids
                 , count(*) as listen_count
              FROM {table}
          GROUP BY user_name
                 , lower(artist_name)
                 , artist_credit_mbids
        )
        SELECT user_name
             , sort_array(
                    collect_list(
                        struct(
                            listen_count
                          , any_artist_name AS artist_name
                          , coalesce(artist_credit_mbids, array()) AS artist_mbids
                        )
                    )
                    , false
               ) as artists
          FROM intermediate_table
      GROUP BY user_name 
    """

    return run_query(top_artists_query).toLocalIterator()
Exemple #29
0
    def test_get_daily_activity(self):
        """Compare all_time daily-activity stats against the JSON fixture."""
        received = list(daily_activity.get_daily_activity('all_time'))

        # The helper should have registered a (weekday, hour) cross-product
        # as the time_range temp view.
        expected_time_range = list(itertools.product(calendar.day_name, range(0, 24)))
        actual_time_range = list(run_query("SELECT * FROM time_range").toLocalIterator())
        self.assertListEqual(expected_time_range, actual_time_range)

        with open(self.path_to_data_file('user_daily_activity.json')) as f:
            expected = json.load(f)

        self.assertEqual(len(received), len(expected))
        for key in ("type", "stats_range", "from_ts", "to_ts"):
            self.assertEqual(received[0][key], expected[0][key])
        # Row order inside "data" is not significant.
        self.assertCountEqual(received[0]["data"], expected[0]["data"])
def get_artists(table):
    """ Get artist information (artist_name, artist_msid etc) for every user
        ordered by listen count.

        The msid is blanked whenever mbids are present (mbids take priority)
        or when it is the empty string.

        Args:
            table (str): name of the temporary table.

        Returns:
            iterator (iter): an iterator over result
                    {
                        user1: [{
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        user2: [{...}],
                    }
    """

    listen_count_query = """
              WITH intermediate_table as (
                SELECT user_name
                     , artist_name
                     , CASE
                         WHEN cardinality(artist_mbids) > 0 THEN NULL
                         ELSE nullif(artist_msid, '')
                       END as artist_msid
                     , artist_mbids
                  FROM {table}
              )
            SELECT *
                 , count(*) as listen_count
              FROM intermediate_table
          GROUP BY user_name
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table=table)

    artist_struct = struct("listen_count", "artist_name", "artist_msid", "artist_mbids")

    # Pack each artist row into a struct, collect per user and sort the
    # resulting array descending (struct order sorts by listen_count first).
    return run_query(listen_count_query) \
        .withColumn("artists", artist_struct) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()