Example #1
    def test_copy(self):
        # Test directories
        utils.create_dir(self.path_)
        utils.create_dir(os.path.join(self.path_, "a"))
        utils.create_dir(os.path.join(self.path_, "b"))

        # DataFrames to create parquets
        df_a = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        df_b = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
        df_c = utils.create_dataframe([Row(column1=5, column2=6)], schema=None)

        # Save DataFrames in respective directories
        utils.save_parquet(df_a, os.path.join(self.path_, "a", "df_a.parquet"))
        utils.save_parquet(df_b, os.path.join(self.path_, "b", "df_b.parquet"))
        utils.save_parquet(df_c, os.path.join(self.path_, "df_c.parquet"))

        utils.copy(self.path_, self.temp_path_, overwrite=True)

        # Read copied DataFrame
        cp_df_a = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "a", "df_a.parquet"))
        cp_df_b = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "b", "df_b.parquet"))
        cp_df_c = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "df_c.parquet"))

        # Check that the original and copied DataFrames match
        self.assertListEqual(df_a.rdd.map(list).collect(), cp_df_a.rdd.map(list).collect())
        self.assertListEqual(df_b.rdd.map(list).collect(), cp_df_b.rdd.map(list).collect())
        self.assertListEqual(df_c.rdd.map(list).collect(), cp_df_c.rdd.map(list).collect())
    def test_get_top_artist_candidate_set(self):
        mapped_listens_df = utils.read_files_from_HDFS(
            self.mapped_listens_path)
        recordings_df = create_dataframes.get_recordings_df(
            mapped_listens_df, {})
        users = create_dataframes.get_users_dataframe(mapped_listens_df, {})
        mapped_listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)

        top_artist_limit = 1
        top_artist_df = candidate_sets.get_top_artists(mapped_listens_subset,
                                                       top_artist_limit, [])

        top_artist_candidate_set_df, top_artist_candidate_set_df_html = candidate_sets.get_top_artist_candidate_set(
            top_artist_df, recordings_df, users, mapped_listens_subset)
        cols = ['recording_id', 'user_id', 'user_name']
        self.assertListEqual(sorted(cols),
                             sorted(top_artist_candidate_set_df.columns))
        self.assertEqual(top_artist_candidate_set_df.count(), 3)

        cols = [
            'top_artist_credit_id', 'top_artist_name', 'mb_artist_credit_id',
            'mb_artist_credit_mbids', 'mb_recording_mbid',
            'msb_artist_credit_name_matchable', 'msb_recording_name_matchable',
            'recording_id', 'user_name', 'user_id'
        ]

        self.assertListEqual(sorted(cols),
                             sorted(top_artist_candidate_set_df_html.columns))
        self.assertEqual(top_artist_candidate_set_df_html.count(), 3)
    def test_filter_last_x_days_recordings(self):
        mapped_listens_df = utils.read_files_from_HDFS(
            self.mapped_listens_path)
        mapped_listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)
        recordings_df = create_dataframes.get_recordings_df(
            mapped_listens_df, {})
        users = create_dataframes.get_users_dataframe(mapped_listens_df, {})

        top_artist_limit = 1
        top_artist_df = candidate_sets.get_top_artists(mapped_listens_subset,
                                                       top_artist_limit, [])

        _, candidate_set_df = candidate_sets.get_top_artist_candidate_set(
            top_artist_df, recordings_df, users, mapped_listens_subset)

        df = candidate_sets.filter_last_x_days_recordings(
            candidate_set_df, mapped_listens_subset)

        user_name = [row.user_name for row in df.collect()]
        self.assertEqual(sorted(user_name), ['rob', 'rob', 'vansika_1'])
        received_recording_mbid = sorted(
            [row.mb_recording_mbid for row in df.collect()])
        expected_recording_mbid = sorted([
            "sf5a56f4-1f83-4681-b319-70a734d0d047",
            "af5a56f4-1f83-4681-b319-70a734d0d047",
            "sf5a56f4-1f83-4681-b319-70a734d0d047"
        ])
        self.assertEqual(expected_recording_mbid, received_recording_mbid)
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
        recordings_df = utils.read_files_from_HDFS(path.RECORDINGS_DATAFRAME_PATH)

        top_artists_candidate_set = utils.read_files_from_HDFS(path.TOP_ARTIST_CANDIDATE_SET)
        similar_artists_candidate_set = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_CANDIDATE_SET)
        mapped_listens = utils.read_files_from_HDFS(path.MAPPED_LISTENS)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']
        user_names = recommendation_metadata['user_name']

    best_model_path = path.DATA_DIR + '/' + best_model_id

    current_app.logger.info('Loading model...')
    t0 = time()
    try:
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
    except Py4JJavaError as err:
        current_app.logger.error('Unable to load model "{}"\n{}\nAborting...'.format(best_model_id, str(err.java_exception)),
            exc_info=True)
        sys.exit(-1)
    time_['load_model'] = '{:.2f}'.format((time() - t0) / 60)

    # persist() is lazy; an action (count) must be called afterwards to actually cache the data in memory
    recordings_df.persist()
    recordings_df.count()

    t0 = time()
    recommendations = get_recommendations(user_names, recordings_df, model, users_df, top_artists_candidate_set,
        similar_artists_candidate_set, mapped_listens)
    time_['total_recommendation_time'] = '{:.2f}'.format((time() - t0) / 3600)

    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    if SAVE_RECOMMENDATION_HTML:
        get_recommendation_html(recommendations, time_, best_model_id, ti)
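
The main() above expects a recommendation-metadata.json file next to the script and reads only the keys 'best_model_id' and 'user_name' from it. A minimal sketch of producing such a file (the values are illustrative placeholders, not taken from the project):

import json

# Hypothetical metadata file; only the two keys read by main() above are shown,
# and the values are placeholders.
recommendation_metadata = {
    'best_model_id': 'example-model-id',
    'user_name': ['example_user_1', 'example_user_2'],
}
with open('recommendation-metadata.json', 'w') as f:
    json.dump(recommendation_metadata, f)
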
    def test_append_dataframe(self):
        hdfs_path = self.path_ + '/test_df.parquet'
        df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        utils.append(df, hdfs_path)
        new_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(new_df.count(), 1)

        df = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
        utils.append(df, hdfs_path)
        appended_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(appended_df.count(), 2)
    def test_save_playcounts_df(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        users_df = create_dataframes.get_users_dataframe(mapped_listens, {}, RECOMMENDATION_RECORDING_USERS_DATAFRAME)
        recordings_df = create_dataframes.get_recordings_df(mapped_listens, {}, RECOMMENDATION_RECORDINGS_DATAFRAME)
        listens_df = create_dataframes.get_listens_df(mapped_listens, {})

        create_dataframes.save_playcounts_df(listens_df, recordings_df, users_df, metadata, RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME)
        playcounts_df = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME)
        self.assertEqual(playcounts_df.count(), 20)

        self.assertListEqual(['spark_user_id', 'recording_id', 'count'], playcounts_df.columns)
        self.assertEqual(metadata['playcounts_count'], 20)
    def test_save_playcounts_df(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        users_df = create_dataframes.get_users_dataframe(mapped_listens, {})
        recordings_df = create_dataframes.get_recordings_df(mapped_listens, {})
        listens_df = create_dataframes.get_listens_df(mapped_listens, {})

        create_dataframes.save_playcounts_df(listens_df, recordings_df, users_df, metadata)
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
        self.assertEqual(playcounts_df.count(), 5)

        self.assertListEqual(['user_id', 'recording_id', 'count'], playcounts_df.columns)
        self.assertEqual(metadata['playcounts_count'], playcounts_df.count())
Example #8
    def test_append_dataframe(self):
        path_ = 'test_df.parquet'
        hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)

        df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
        utils.append(df, hdfs_path)
        new_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(new_df.count(), 1)

        df = utils.create_dataframe(Row(column1=3, column2=4), schema=None)
        utils.append(df, hdfs_path)
        appended_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(appended_df.count(), 2)
    def test_get_top_artists(self):
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        top_artist_limit = 1
        test_top_artist = candidate_sets.get_top_artists(
            mapped_listens, top_artist_limit, [])

        cols = [
            'top_artist_credit_id', 'top_artist_name', 'user_name',
            'total_count'
        ]
        self.assertListEqual(cols, test_top_artist.columns)
        self.assertEqual(test_top_artist.count(), 2)

        top_artist_id = sorted(
            [row.top_artist_credit_id for row in test_top_artist.collect()])
        self.assertEqual(top_artist_id[0], 2)
        self.assertEqual(top_artist_id[1], 2)

        # empty df
        mapped_listens = mapped_listens.select('*').where(
            f.col('user_name') == 'lala')
        with self.assertRaises(TopArtistNotFetchedException):
            candidate_sets.get_top_artists(mapped_listens, top_artist_limit,
                                           [])

        with self.assertRaises(TopArtistNotFetchedException):
            candidate_sets.get_top_artists(mapped_listens, top_artist_limit,
                                           ['lala'])
def main(train_model_window=None):

    ti = time.monotonic()
    # dict to collect dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    partial_listens_df = get_listens_for_training_model_window(
        to_date, from_date, metadata, path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info(
        'Listen count from {from_date} to {to_date}: {listens_count}'.format(
            from_date=from_date,
            to_date=to_date,
            listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = get_unique_rows_from_mapping(df)
    current_app.logger.info(
        'Number of distinct rows in the mapping: {}'.format(
            msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(
        partial_listens_df, msid_mbid_mapping_df)
    current_app.logger.info('Listen count after mapping: {}'.format(
        mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    listens_df = get_listens_df(mapped_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, metadata)

    generate_dataframe_id(metadata)
    save_dataframe_metadata_to_hdfs(metadata)

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_musicbrainz_data_itr = get_data_missing_from_musicbrainz(
        partial_listens_df, msid_mbid_mapping_df)

    messages = prepare_messages(missing_musicbrainz_data_itr, from_date,
                                to_date, ti)

    return messages
Example #11
def _create_mapped_dataframe():
    """ Use MSID-MBID mapping to improve the data accuracy and quality

        Returns:
            mapped_df (dataframe): A DataFrame with mapped data
    """
    # Read the mapped data into dataframe with the needed columns
    mapping_df = read_files_from_HDFS(MBID_MSID_MAPPING).select(
        'mb_artist_credit_name', 'mb_artist_credit_mbids', 'msb_artist_msid')
    mapping_df.createOrReplaceTempView('mapping')

    mapped_df = run_query("""
                SELECT CASE
                         WHEN isnull(mb_artist_credit_name) THEN artist_name
                         ELSE mb_artist_credit_name
                       END as artist_name
                     , CASE
                         WHEN isnull(mb_artist_credit_mbids) THEN artist_mbids
                         ELSE mb_artist_credit_mbids
                       END as artist_mbids
                     , CASE
                         WHEN isnull(mb_artist_credit_mbids) AND cardinality(artist_mbids) == 0 THEN nullif(artist_msid, "")
                         ELSE NULL
                       END as artist_msid
                     , listened_at
                  FROM listens
             LEFT JOIN mapping
                    ON listens.artist_msid == mapping.msb_artist_msid
                    """)

    return mapped_df
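
The query above selects FROM listens, so _create_mapped_dataframe assumes a 'listens' temporary view has already been registered on the session. A minimal usage sketch under that assumption (the parquet path is hypothetical):

# Hypothetical caller: register the listens view, then build the mapped dataframe.
listens_df = read_files_from_HDFS('/data/listens.parquet')  # path is an assumption
listens_df.createOrReplaceTempView('listens')
mapped_df = _create_mapped_dataframe()
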
Example #12
def create_mapped_dataframe(listens_df):
    """
    Use artist credit recording pair from MSID-MBID mapping to improve stats quality

    Args:
        listens_df: The DataFrame to perform mapping upon

    Returns:
        result: DataFrame consisting of mapped list of listens
    """
    # Fetch mapping from HDFS
    msid_mbid_mapping_df = utils.get_unique_rows_from_mapping(
        utils.read_files_from_HDFS(MBID_MSID_MAPPING))

    # Create matchable fields from listens table
    matchable_df = utils.convert_text_fields_to_matchable(listens_df)

    join_condition = [
        matchable_df.track_name_matchable ==
        msid_mbid_mapping_df.msb_recording_name_matchable,
        matchable_df.artist_name_matchable ==
        msid_mbid_mapping_df.msb_artist_credit_name_matchable
    ]
    intermediate_df = matchable_df.join(msid_mbid_mapping_df, join_condition,
                                        'left')
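
The example above is cut off after the left join. A hedged sketch of how the joined result might be reduced back to listen columns, mirroring the CASE/WHEN fallback logic of Example #11 (the selected column names are assumptions, not the project's confirmed output):

from pyspark.sql import functions as f

# Prefer the mapped MusicBrainz fields and fall back to the original listen
# fields where no mapping row matched (assumed column names).
result = intermediate_df.select(
    f.coalesce('mb_artist_credit_name', 'artist_name').alias('artist_name'),
    f.coalesce('mb_artist_credit_mbids', 'artist_mbids').alias('artist_mbids'),
    'listened_at'
)
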
    def test_get_listens_df(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        listens_df = create_dataframes.get_listens_df(mapped_listens, metadata)
        self.assertEqual(listens_df.count(), 24)
        self.assertCountEqual(['recording_mbid', 'user_id'], listens_df.columns)
        self.assertEqual(metadata['listens_count'], 24)
    def test_get_listens_df(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        listens_df = create_dataframes.get_listens_df(mapped_listens, metadata)
        self.assertEqual(listens_df.count(), 8)
        self.assertListEqual(['mb_recording_mbid', 'user_name'], listens_df.columns)
        self.assertEqual(metadata['listens_count'], 8)
    def test_get_data_missing_from_musicbrainz(self):
        partial_listen_df = utils.read_files_from_HDFS(
            LISTENBRAINZ_NEW_DATA_DIRECTORY)
        itr = create_dataframes.get_data_missing_from_musicbrainz(
            partial_listen_df)
        messages = create_dataframes.prepare_messages(itr, self.begin_date,
                                                      self.end_date,
                                                      time.monotonic())

        received_first_mssg = messages.pop(0)

        self.assertEqual(received_first_mssg['type'],
                         'cf_recommendations_recording_dataframes')
        self.assertEqual(received_first_mssg['from_date'],
                         str(self.begin_date.strftime('%b %Y')))
        self.assertEqual(received_first_mssg['to_date'],
                         str(self.end_date.strftime('%b %Y')))
        self.assertIsInstance(received_first_mssg['dataframe_upload_time'],
                              str)
        self.assertIsInstance(received_first_mssg['total_time'], str)

        with open(os.path.join(TEST_DATA_PATH,
                               'missing_musicbrainz_data.json')) as f:
            expected_missing_mb_data = json.load(f)
        self.assertEqual(expected_missing_mb_data, messages)
    def test_get_dates_to_generate_candidate_sets(self):
        mapped_df = utils.read_files_from_HDFS(
            RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        from_date, to_date = candidate_sets.get_dates_to_generate_candidate_sets(
            mapped_df, 7)
        self.assertEqual(to_date, datetime(2021, 8, 9, 10, 20, 11))
        self.assertEqual(from_date, datetime(2021, 8, 2))
    def test_get_mapped_artist_and_recording_mbids(self):
        to_date = get_latest_listen_ts()
        partial_listen_df = dataframe_utils.get_listens_for_training_model_window(to_date, to_date, self.listens_path)

        df = utils.read_files_from_HDFS(self.mapping_path)
        mapping_df = mapping_utils.get_unique_rows_from_mapping(df)
        mapped_listens_path = '/mapped_listens.parquet'

        mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(partial_listen_df, mapping_df, mapped_listens_path)
        self.assertEqual(mapped_listens.count(), 8)

        cols = [
            'listened_at',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'mb_release_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'user_name'
        ]

        self.assertListEqual(sorted(cols), sorted(mapped_listens.columns))
        status = utils.path_exists(mapped_listens_path)
        self.assertTrue(status)
Example #18
    def upload_test_mapped_listens_to_hdfs(cls, listens_path, mapping_path, mapped_listens_path):
        partial_listen_df = dataframe_utils.get_listens_for_training_model_window(cls.date, cls.date, listens_path)
        df = utils.read_files_from_HDFS(mapping_path)
        mapping_df = mapping_utils.get_unique_rows_from_mapping(df)

        mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(partial_listen_df, mapping_df)
        utils.save_parquet(mapped_listens, mapped_listens_path)
    def test_import_full_dump_by_id_handler(self, mock_datetime, mock_upload,
                                            mock_download, _):
        mock_src = MagicMock()
        mock_download.return_value = (
            mock_src, 'listenbrainz-spark-dump-202-20200915-180002-full.tar',
            202)
        mock_datetime.utcnow.return_value = datetime(2020, 8, 18)

        messages = import_dump.import_full_dump_by_id_handler(202)
        mock_download.assert_called_once_with(directory=mock.ANY,
                                              dump_type=DumpType.FULL,
                                              listens_dump_id=202)
        mock_upload.assert_called_once_with(mock_src)

        # Check if appropriate entry has been made in the table
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
        expected_count = import_meta_df \
            .filter(import_meta_df.imported_at == datetime(2020, 8, 18)) \
            .filter("dump_id == 202 AND dump_type == 'full'") \
            .count()

        self.assertEqual(expected_count, 1)
        self.assertEqual(len(messages), 1)
        self.assertListEqual(
            ['listenbrainz-spark-dump-202-20200915-180002-full.tar'],
            messages[0]['imported_dump'])
Example #20
    def test_import_full_dump_handler(self, mock_datetime, mock_temp,
                                      mock_rmtree, mock_upload, mock_download,
                                      mock_ftp_constructor):
        mock_src = MagicMock()
        mock_temp.mkdtemp.return_value = 'best_dir_ever'
        mock_download.return_value = (
            mock_src,
            'listenbrainz-listens-dump-202-20200915-180002-spark-full.tar.xz',
            202)
        mock_datetime.utcnow.return_value = datetime(2020, 8, 18)

        messages = import_newest_full_dump_handler()
        mock_download.assert_called_once_with(directory='best_dir_ever',
                                              dump_type='full',
                                              listens_dump_id=None)
        mock_upload.assert_called_once_with(mock_src, overwrite=True)
        mock_rmtree.assert_called_once_with('best_dir_ever')

        # Check if appropriate entry has been made in the table
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
        expected_count = import_meta_df \
            .filter(import_meta_df.imported_at == datetime(2020, 8, 18)) \
            .filter("dump_id == 202 AND dump_type == 'full'") \
            .count()

        self.assertEqual(expected_count, 1)
        self.assertEqual(len(messages), 1)
        self.assertListEqual([
            'listenbrainz-listens-dump-202-20200915-180002-spark-full.tar.xz'
        ], messages[0]['imported_dump'])
Example #21
    def test_preprocess_data(self):
        test_playcounts_df = utils.read_files_from_HDFS(TEST_PLAYCOUNTS_PATH)
        training_data, validation_data, test_data = train_models.preprocess_data(
            test_playcounts_df)
        total_playcounts = (training_data.count() + validation_data.count()
                            + test_data.count())
        self.assertEqual(total_playcounts, PLAYCOUNTS_COUNT)
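
The test above only checks that the three splits partition the playcounts. A hedged sketch of the kind of split that would satisfy it, assuming a plain randomSplit (the real train_models.preprocess_data may differ):

def preprocess_data_sketch(playcounts_df):
    # Assumed behaviour: split playcounts into training/validation/test sets;
    # randomSplit assigns every row to exactly one split, so the counts sum
    # back to the original total, as the assertion above requires.
    training_data, validation_data, test_data = playcounts_df.randomSplit(
        [0.7, 0.2, 0.1], seed=42)
    return training_data, validation_data, test_data
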
    def upload_test_mapped_listens_to_HDFS(cls):
        partial_listen_df = create_dataframes.get_listens_for_training_model_window(
            cls.date, cls.date, {}, LISTENS_PATH)
        mapping_df = utils.read_files_from_HDFS(MAPPING_PATH)

        mapped_df = create_dataframes.get_mapped_artist_and_recording_mbids(
            partial_listen_df, mapping_df)
        utils.save_parquet(mapped_df, MAPPED_LISTENS_PATH)
Example #23
    def upload_test_mapped_listens_to_HDFS(cls):
        partial_listen_df = create_dataframes.get_listens_for_training_model_window(
            cls.date, cls.date, {}, cls.listens_path)
        mapping_df = utils.read_files_from_HDFS(cls.mapping_path)

        mapped_listens = create_dataframes.get_mapped_artist_and_recording_mbids(
            partial_listen_df, mapping_df)
        utils.save_parquet(mapped_listens, cls.mapped_listens_path)
    def test_get_listens_df(self):
        metadata = {}
        mapped_df = utils.read_files_from_HDFS(MAPPED_LISTENS_PATH)
        listens_df = create_dataframes.get_listens_df(mapped_df, metadata)
        self.assertEqual(listens_df.count(), 1)
        self.assertListEqual(['mb_recording_mbid', 'user_name'],
                             listens_df.columns)
        self.assertEqual(metadata['listens_count'], 1)
    def test_create_dataframe(self):
        hdfs_path = self.path_ + '/test_df.parquet'
        df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        self.assertEqual(df.count(), 1)
        utils.save_parquet(df, hdfs_path)

        received_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(received_df.count(), 1)
    def upload_test_mapping_listens_subset_to_hdfs(cls):
        mapped_df = utils.read_files_from_HDFS(cls.mapped_listens_path)
        from_date = stats.offset_days(cls.date, 4)
        to_date = cls.date
        mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
            mapped_df, from_date, to_date)
        utils.save_parquet(mapped_listens_subset,
                           cls.mapped_listens_subset_path)
Example #27
    def test_save_parquet(self):
        path_ = 'test_df.parquet'
        hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)

        df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
        utils.save_parquet(df, hdfs_path)

        received_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(received_df.count(), 1)
def main(max_num_users: int):

    logger.info('Start generating similar user matrix')
    try:
        listenbrainz_spark.init_spark_session('User Similarity')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_USERS_DATAFRAME)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    vectors_df = get_vectors_df(playcounts_df)

    similarity_matrix = Correlation.corr(
        vectors_df, 'vector', 'pearson').first()['pearson(vector)'].toArray()
    similar_users = threshold_similar_users(similarity_matrix, max_num_users)

    # Due to an unresolved bug in Spark (https://issues.apache.org/jira/browse/SPARK-10925), we cannot join twice on
    # the same dataframe. Hence, we create a modified dataframe with the columns renamed.
    other_users_df = users_df\
        .withColumnRenamed('user_id', 'other_user_id')\
        .withColumnRenamed('user_name', 'other_user_name')

    similar_users_df = listenbrainz_spark.session.createDataFrame(similar_users, ['user_id', 'other_user_id',
        'similarity', 'global_similarity'])\
        .join(users_df, 'user_id', 'inner')\
        .join(other_users_df, 'other_user_id', 'inner')\
        .select('user_name', struct('other_user_name', 'similarity', 'global_similarity').alias('similar_user'))\
        .groupBy('user_name')\
        .agg(collect_list('similar_user').alias('similar_users'))

    logger.info('Finished generating similar user matrix')

    return create_messages(similar_users_df)
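
Correlation.corr above requires a DataFrame with a single Vector column (here named 'vector'), one row per user. A toy sketch of the shape get_vectors_df presumably produces (dimensions and values are illustrative only):

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# One row per user, each row a dense playcount vector.
toy_vectors_df = listenbrainz_spark.session.createDataFrame(
    [(Vectors.dense([1.0, 0.0, 2.0]),),
     (Vectors.dense([0.0, 3.0, 1.0]),)],
    ['vector']
)
toy_matrix = Correlation.corr(toy_vectors_df, 'vector', 'pearson') \
    .first()['pearson(vector)'].toArray()
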
    def test_save_dataframe_metadata_to_HDFS(self):
        df_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
        metadata = self.get_dataframe_metadata(df_id)
        create_dataframes.save_dataframe_metadata_to_hdfs(metadata, RECOMMENDATION_RECORDING_DATAFRAME_METADATA)

        status = utils.path_exists(RECOMMENDATION_RECORDING_DATAFRAME_METADATA)
        self.assertTrue(status)

        df = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_DATAFRAME_METADATA)
        self.assertCountEqual(df.columns, schema.dataframe_metadata_schema.fieldNames())
    def test_get_users_dataframe(self):
        metadata = {}
        mapped_df = utils.read_files_from_HDFS(MAPPED_LISTENS_PATH)
        users_df = create_dataframes.get_users_dataframe(mapped_df, metadata)
        self.assertEqual(users_df.count(), 1)
        self.assertListEqual(['user_name', 'user_id'], users_df.columns)
        self.assertEqual(metadata['users_count'], users_df.count())

        status = utils.path_exists(path.USERS_DATAFRAME_PATH)
        self.assertTrue(status)