コード例 #1
0
def get_listens_for_training_model_window(to_date, from_date, metadata,
                                          dest_path):
    """ Prepare a dataframe of listens of X days to train.

        Records the window bounds in *metadata*, loads the listens from HDFS,
        drops rows that carry artist/recording MBIDs, and normalises the text
        fields so they can be matched later.

        Args:
            from_date (datetime): Date from which to start fetching listens.
            to_date (datetime): Date up to which to fetch listens.
            metadata (dict): Mutated in place with 'to_date' / 'from_date'.
            dest_path (str): HDFS path.

        Returns:
            dataframe: listens without artist mbid and recording mbid, with
            matchable text fields.
    """
    metadata['to_date'], metadata['from_date'] = to_date, from_date
    try:
        listens_df = utils.get_listens(from_date, to_date, dest_path)
    except (ValueError, FileNotFetchedException) as err:
        # Log with traceback, then let the caller decide how to recover.
        current_app.logger.error(str(err), exc_info=True)
        raise

    without_mbids_df = utils.get_listens_without_artist_and_recording_mbids(
        listens_df)
    return convert_text_fields_to_matchable(without_mbids_df)
コード例 #2
0
def get_listens_for_rec_generation_window():
    """ Prepare dataframe of listens of X days to generate recommendations.
        Here X is the config value RECOMMENDATION_GENERATION_WINDOW.

        Returns:
            df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]
    """
    to_date = datetime.utcnow()
    # Go back the configured number of days, then snap to the first of that
    # month so whole monthly listen files are read.
    from_date = stats.replace_days(
        stats.adjust_days(to_date, config.RECOMMENDATION_GENERATION_WINDOW), 1)
    try:
        return utils.get_listens(
            from_date, to_date,
            config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY)
    except (ValueError, FileNotFetchedException) as err:
        # Cannot proceed without listens; log and abort the job.
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
コード例 #3
0
def get_entity_all_time(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the all_time sitewide top entity """
    logger.debug("Calculating sitewide_{}_all_time...".format(entity))

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # One row per calendar year in the window: [label, start_ts, end_ts].
    time_range = []
    for year in range(from_date.year, to_date.year + 1):
        time_range.append([
            str(year),
            int(datetime(year, 1, 1).timestamp()),
            int(get_year_end(year).timestamp()),
        ])
    listenbrainz_spark.session \
        .createDataFrame(time_range, schema=time_range_schema) \
        .createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_all_time'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "yyyy", use_mapping)
    message = create_message(data=data,
                             entity=entity,
                             stats_range='all_time',
                             from_ts=from_date.timestamp(),
                             to_ts=to_date.timestamp())

    logger.debug("Done!")
    return message
コード例 #4
0
def get_listens_for_training_model_window(to_date, from_date, metadata,
                                          dest_path):
    """ Prepare dataframe of listens of X days to train. Here X is a config value.

        Args:
            from_date (datetime): Date from which to start fetching listens.
            to_date (datetime): Date up to which to fetch listens.
            metadata (dict): Mutated in place with 'to_date' / 'from_date'.
            dest_path (str): HDFS path.

        Returns:
            A dataframe with columns as:
                [
                    artist_msid, artist_name, listened_at, recording_msid, release_mbid,
                    release_msid, release_name, tags, track_name, user_name
                ]
    """
    metadata['to_date'], metadata['from_date'] = to_date, from_date
    try:
        training_df = utils.get_listens(from_date, to_date, dest_path)
    except (ValueError, FileNotFetchedException) as err:
        # Training cannot continue without listens; log and abort the job.
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    return utils.get_listens_without_artist_and_recording_mbids(training_df)
コード例 #5
0
def get_listens_for_training_model_window(metadata):
    """ Prepare dataframe of listens of X days to train. Here X is the
        config value TRAIN_MODEL_WINDOW.

        Args:
            metadata (dict): Mutated in place with 'to_date' / 'from_date'.

        Returns:
            training_df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]
    """
    to_date = datetime.utcnow()
    # Go back TRAIN_MODEL_WINDOW days, then snap to the first of that month
    # so whole monthly listen files are read.
    from_date = stats.replace_days(
        stats.adjust_days(to_date, config.TRAIN_MODEL_WINDOW), 1)

    metadata['to_date'], metadata['from_date'] = to_date, from_date
    try:
        return utils.get_listens(
            from_date, to_date,
            config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY)
    except (ValueError, FileNotFetchedException) as err:
        # Training cannot continue without listens; log and abort the job.
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
コード例 #6
0
    def test_get_releases_empty(self):
        """get_releases should produce no rows when the listen data is empty."""
        self.save_dataframe('user_top_releases_empty.json')
        df = utils.get_listens(datetime.now(), datetime.now(), self.path_)
        df.createOrReplaceTempView('test_view')

        # NOTE(review): this loaded JSON is overwritten two statements below
        # and never used — looks like leftover copy-paste; confirm the open()
        # is not an intentional fixture-presence check before removing.
        with open(self.path_to_data_file('user_top_releases.json')) as f:
            data = json.load(f)

        received = defaultdict(list)
        data = release_stats.get_releases('test_view')
        for entry in data:
            _dict = entry.asDict(recursive=True)
            received[_dict['user_name']] = _dict['releases']

        # Empty input listens => no per-user releases collected.
        self.assertDictEqual(received, {})
コード例 #7
0
def get_latest_listen_ts():
    """ Get the timestamp of the latest listen present in the spark cluster.

    Probes month by month, stepping backwards from the current month until a
    listens file is found, then returns the maximum listened_at value in it.
    """
    probe_month = datetime.now()
    df = None
    while df is None:
        try:
            df = utils.get_listens(probe_month, probe_month,
                                   LISTENBRAINZ_DATA_DIRECTORY)
        except HDFSException:
            # No data for this month; step one month back and retry.
            probe_month = offset_months(probe_month, 1)

    df.createOrReplaceTempView('latest_listen_ts')
    result = run_query(
        "SELECT MAX(listened_at) as max_timestamp FROM latest_listen_ts")
    return result.collect()[0]['max_timestamp']
コード例 #8
0
    def test_get_listens(self):
        """get_listens should union the monthly parquet files in the window."""
        from_date = datetime(2019, 10, 1)
        to_date = datetime(2019, 11, 1)

        # Write one single-row parquet file per month boundary of the window.
        fixtures = [(from_date, Row(column1=1, column2=2)),
                    (to_date, Row(column1=3, column2=4))]
        for date, row in fixtures:
            monthly_df = utils.create_dataframe([row], schema=None)
            utils.save_parquet(
                monthly_df,
                self.path_ + '/{}/{}.parquet'.format(date.year, date.month))

        received_df = utils.get_listens(from_date, to_date, self.path_)
        # One row per monthly file => two rows total.
        self.assertEqual(received_df.count(), 2)
コード例 #9
0
    def test_get_listens(self):
        """get_listens should read every monthly parquet file in the window."""
        from_date = datetime(2019, 10, 1)
        to_date = datetime(2019, 11, 1)
        hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, 'test_df')

        # One single-row parquet file for each month boundary of the window.
        for date, row in ((from_date, Row(column1=1, column2=2)),
                          (to_date, Row(column1=3, column2=4))):
            monthly_df = utils.create_dataframe(row, schema=None)
            utils.save_parquet(
                monthly_df,
                hdfs_path + '/{}/{}.parquet'.format(date.year, date.month))

        received_df = utils.get_listens(from_date, to_date, hdfs_path)
        # One row per monthly file => two rows total.
        self.assertEqual(received_df.count(), 2)
コード例 #10
0
def get_entity_week(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the weekly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_week...".format(entity))

    latest_listen = get_latest_listen_ts()
    monday = get_last_monday(latest_listen)
    # Truncate to midnight so the week boundary falls at 00:00.
    to_date = datetime(monday.year, monday.month, monday.day)
    from_date = offset_days(to_date, 14)

    # One row per day of the last and current week: [label, start_ts, end_ts].
    time_range = []
    cursor = from_date
    while cursor < to_date:
        time_range.append([
            cursor.strftime('%A %d %B %Y'),
            int(cursor.timestamp()),
            int(get_day_end(cursor).timestamp()),
        ])
        cursor = offset_days(cursor, 1, shift_backwards=False)

    listenbrainz_spark.session \
        .createDataFrame(time_range, schema=time_range_schema) \
        .createOrReplaceTempView('time_range')

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    table_name = 'sitewide_{}_week'.format(entity)
    # Monthly files can contain rows outside the window; narrow before use.
    filter_listens(listens_df, from_date, to_date) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "EEEE dd MMMM yyyy",
                                      use_mapping)
    message = create_message(data=data,
                             entity=entity,
                             stats_range='week',
                             from_ts=from_date.timestamp(),
                             to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
コード例 #11
0
    def test_get_artists(self):
        """get_artists should aggregate per-user artist stats from the view."""
        self.save_dataframe()
        df = utils.get_listens(datetime.now(), datetime.now(), self.path_)
        df.createOrReplaceTempView('test_view')

        # NOTE(review): this loaded JSON is overwritten below and never used —
        # looks like leftover copy-paste; confirm the open() is not an
        # intentional fixture-presence check before removing.
        with open(self.path_to_data_file('user_top_artists.json')) as f:
            data = json.load(f)

        with open(self.path_to_data_file('user_top_artists_output.json')) as f:
            expected = json.load(f)

        data = artist_stats.get_artists('test_view')
        received = defaultdict(list)
        for entry in data:
            _dict = entry.asDict(recursive=True)
            received[_dict['user_name']] = _dict['artists']

        self.assertDictEqual(received, expected)
コード例 #12
0
ファイル: entity.py プロジェクト: mhor/listenbrainz-server
def get_entity_year(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the year top entity for all users """
    logger.debug("Calculating {}_year...".format(entity))

    to_date = get_latest_listen_ts()
    # January 1st of the year containing the latest listen.
    from_date = replace_days(replace_months(to_date, 1), 1)

    table_name = 'user_{}_year'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='year',
                               from_ts=from_date.timestamp(),
                               to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
コード例 #13
0
ファイル: entity.py プロジェクト: mhor/listenbrainz-server
def get_entity_all_time(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the all_time top entity for all users """
    logger.debug("Calculating {}_all_time...".format(entity))

    to_date = get_latest_listen_ts()
    # The stats window opens at Last.fm's founding year.
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    table_name = 'user_{}_all_time'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity,
                               stats_range='all_time',
                               from_ts=from_date.timestamp(),
                               to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
コード例 #14
0
def calculate():
    """Compute all-time per-user artist statistics.

    Registers all listens since Last.fm's founding year as a temp view, then
    builds a mapping of user_name -> {'artists': {stats, count}}.
    """
    now = datetime.utcnow()
    table_name = 'stats_user_all'
    get_listens(from_date=datetime(LAST_FM_FOUNDING_YEAR, 1, 1),
                to_date=now,
                dest_path=path.LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = defaultdict(dict)
    for user_name, user_artists in get_artists(table_name).items():
        data[user_name]['artists'] = {
            'artist_stats': user_artists,
            'artist_count': len(user_artists),
        }
    return data
コード例 #15
0
ファイル: all.py プロジェクト: skywinder/listenbrainz-server
def calculate():
    """Compute all-time per-user artist statistics as a list of messages.

    Registers all listens since Last.fm's founding year as a temp view, then
    emits one 'user_artist' message per user.
    """
    now = datetime.utcnow()
    table_name = 'stats_user_all'
    get_listens(from_date=datetime(LAST_FM_FOUNDING_YEAR, 1, 1),
                to_date=now,
                dest_path=path.LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    # One message per user carrying their artist stats and count.
    return [
        {
            'musicbrainz_id': user_name,
            'type': 'user_artist',
            'artist_stats': user_artists,
            'artist_count': len(user_artists),
        }
        for user_name, user_artists in get_artists(table_name).items()
    ]
コード例 #16
0
def get_entity_year(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the yearly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_year...".format(entity))

    to_date = get_latest_listen_ts()
    # Start from January 1st of the previous year.
    from_date = datetime(to_date.year - 1, 1, 1)

    # One row per month of the last and current year: [label, start_ts, end_ts].
    time_range = []
    cursor = from_date
    while cursor < to_date:
        time_range.append([
            cursor.strftime('%B %Y'),
            int(cursor.timestamp()),
            int(get_month_end(cursor).timestamp()),
        ])
        cursor = offset_months(cursor, 1, shift_backwards=False)

    listenbrainz_spark.session \
        .createDataFrame(time_range, schema=time_range_schema) \
        .createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_year'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "MMMM yyyy", use_mapping)
    message = create_message(data=data,
                             entity=entity,
                             stats_range='year',
                             from_ts=from_date.timestamp(),
                             to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
コード例 #17
0
ファイル: entity.py プロジェクト: mhor/listenbrainz-server
def get_entity_week(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the weekly top entity for all users """
    logger.debug("Calculating {}_week...".format(entity))

    latest_listen = get_latest_listen_ts()
    to_date = get_last_monday(latest_listen)
    from_date = offset_days(to_date, 7)

    table_name = 'user_{}_week'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    # Monthly files can contain rows outside the window; narrow before use.
    filter_listens(listens_df, from_date, to_date) \
        .createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='week',
                               from_ts=from_date.timestamp(),
                               to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
コード例 #18
0
def get_listens_for_training_model_window(to_date, from_date, dest_path):
    """ Prepare dataframe of listens to train.

        Args:
            from_date (datetime): Date from which to start fetching listens.
            to_date (datetime): Date up to which to fetch listens.
            dest_path (str): HDFS path.

        Returns:
            partial_listens_df (dataframe): listens with matchable text fields.
    """
    try:
        listens_df = get_listens(from_date, to_date, dest_path)
    except (ValueError, FileNotFetchedException) as err:
        # Log with traceback, then let the caller decide how to recover.
        current_app.logger.error(str(err), exc_info=True)
        raise

    return mapping_utils.convert_text_fields_to_matchable(listens_df)
コード例 #19
0
def _get_listens(from_date: datetime, to_date: datetime):
    """Register listens in [from_date, to_date] as the 'listens' temp view."""
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY) \
        .createOrReplaceTempView('listens')
コード例 #20
0
def _get_listens(from_date: datetime, to_date: datetime):
    """Register window-filtered listens as the 'listens' temp view."""
    raw_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    # presumably trims rows outside the window that the monthly files
    # over-fetch — see filter_listens for the exact boundary semantics.
    window_df = filter_listens(raw_df, from_date, to_date)
    window_df.createOrReplaceTempView('listens')