def upload_new_listens_full_dump(self, archive: str):
    """ Upload parquet listens dump (new format) of a full dump to HDFS.

        Args:
            archive: path to parquet listens dump to be uploaded
    """
    src_path = self.upload_archive_to_temp(archive)
    dest_path = path.LISTENBRAINZ_NEW_DATA_DIRECTORY

    # Delete existing dumps if any
    if utils.path_exists(dest_path):
        logger.info(f'Removing {dest_path} from HDFS...')
        utils.delete_dir(dest_path, recursive=True)
        logger.info('Done!')

    logger.info(f"Moving the processed files from {src_path} to {dest_path}")
    t0 = time.monotonic()

    # Check if the parent directory exists; if not, create it
    dest_path_parent = str(Path(dest_path).parent)
    if not utils.path_exists(dest_path_parent):
        utils.create_dir(dest_path_parent)

    utils.rename(src_path, dest_path)
    logger.info(f"Done! Time taken: {time.monotonic() - t0:.2f}")
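# Hedged usage sketch (not from the source): assuming this method lives on an
# uploader class, here hypothetically named ListenbrainzDataUploader, a
# full-dump import might look like:
#
#   uploader = ListenbrainzDataUploader()
#   uploader.upload_new_listens_full_dump('/dumps/v-2021-08-15.tar')
#
# The temp-dir indirection keeps the swap close to atomic: the new files land
# in HDFS_TEMP_DIR first, the old LISTENBRAINZ_NEW_DATA_DIRECTORY is deleted
# only afterwards, and a single rename moves the fresh data into place.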
def test_path_exists(self):
    path_ = '/tests/test'
    utils.create_dir(path_)

    status = utils.path_exists(path_)
    self.assertTrue(status)

    utils.delete_dir(path_)
    status = utils.path_exists(path_)
    self.assertFalse(status)
def test_save_candidate_sets(self):
    top_artist_candidate_set_df = self.get_candidate_set()
    similar_artist_candidate_set_df = self.get_candidate_set()
    candidate_sets.save_candidate_sets(top_artist_candidate_set_df, similar_artist_candidate_set_df)

    top_artist_exist = utils.path_exists(path.TOP_ARTIST_CANDIDATE_SET)
    self.assertTrue(top_artist_exist)

    similar_artist_exist = utils.path_exists(path.SIMILAR_ARTIST_CANDIDATE_SET)
    self.assertTrue(similar_artist_exist)
def test_rename(self):
    utils.create_dir(self.path_)
    test_exists = utils.path_exists(self.path_)
    self.assertTrue(test_exists)

    utils.rename(self.path_, self.temp_path_)
    test_exists = utils.path_exists(self.path_)
    self.assertFalse(test_exists)

    temp_exists = utils.path_exists(self.temp_path_)
    self.assertTrue(temp_exists)
    utils.delete_dir(self.temp_path_)
def delete_model():
    """ Delete model.

        Note: At any point in time, only one model is in HDFS.
    """
    dir_exists = utils.path_exists(path.DATA_DIR)
    if dir_exists:
        utils.delete_dir(path.DATA_DIR, recursive=True)
def upload_archive_to_temp(self, archive: str) -> str:
    """ Upload parquet files in archive to a temporary HDFS directory.

        Args:
            archive: the archive to be uploaded

        Returns:
            path of the temp dir where the archive has been uploaded

        Notes:
            The following dump structure should be ensured for this function to
            work correctly. Say the dump is named v-2021-08-15.tar. The tar should
            contain one top level directory, v-2021-08-15. This directory should
            contain all the files that need to be uploaded.
    """
    with tempfile.TemporaryDirectory() as local_temp_dir:
        logger.info("Cleaning HDFS temporary directory...")
        if utils.path_exists(HDFS_TEMP_DIR):
            utils.delete_dir(HDFS_TEMP_DIR, recursive=True)

        logger.info("Uploading listens to temporary directory in HDFS...")
        self.extract_and_upload_archive(archive, local_temp_dir, HDFS_TEMP_DIR)

    # dump is uploaded to HDFS_TEMP_DIR/archive_name
    archive_name = Path(archive).stem
    return str(Path(HDFS_TEMP_DIR).joinpath(archive_name))
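# Hedged sketch (an assumption for illustration, not from the source): building
# a tar with the layout the Notes above require, i.e. one top-level directory
# whose name matches the archive's stem, containing every file to upload.
#
#   import tarfile
#
#   with tarfile.open('v-2021-08-15.tar', 'w') as tar:
#       # everything under the local 'v-2021-08-15/' directory is added
#       # beneath a single top-level dir of the same name inside the tar
#       tar.add('v-2021-08-15', arcname='v-2021-08-15')
#
# Path('v-2021-08-15.tar').stem == 'v-2021-08-15', so the function resolves the
# returned upload location to HDFS_TEMP_DIR/v-2021-08-15.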
def test_delete_model(self):
    df = utils.create_dataframe(Row(col1=1, col2=1), None)
    utils.save_parquet(df, path.RECOMMENDATION_RECORDING_DATA_DIR)

    train_models.delete_model()

    dir_exists = utils.path_exists(path.RECOMMENDATION_RECORDING_DATA_DIR)
    self.assertFalse(dir_exists)
def test_search_dump_file_missing(self):
    """ Test to ensure 'False' is returned if metadata file is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
def process_json_listens(self, filename, data_dir, tmp_hdfs_path, append, schema):
    """ Process a file containing listens from the ListenBrainz dump and add
        listens to appropriate dataframes.

        Args:
            filename (str): File name of the JSON file.
            data_dir (str): Dir to save listens to in HDFS as parquet.
            tmp_hdfs_path (str): HDFS path where the listens JSON has been uploaded.
            append (bool): If true, append to the end of the parquet rather than write.
            schema: Schema of the listens.
    """
    start_time = time.monotonic()
    df = utils.read_json(tmp_hdfs_path, schema=schema)

    if filename.split('/')[-1] == 'invalid.json':
        dest_path = os.path.join(data_dir, 'invalid.parquet')
    else:
        year = filename.split('/')[-2]
        month = filename.split('/')[-1][0:-5]
        dest_path = os.path.join(data_dir, year, '{}.parquet'.format(str(month)))

    # log the destination before writing, so the message precedes the upload
    logger.info("Uploading to {}...".format(dest_path))
    if append and utils.path_exists(dest_path):
        utils.save_parquet(df, dest_path, mode="append")
    else:
        utils.save_parquet(df, dest_path, mode="overwrite")

    logger.info("File processed in {:.2f} seconds!".format(time.monotonic() - start_time))
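# Worked example of the dest_path derivation above (illustrative values only):
# for filename='/listens/2021/08.json' and data_dir='/data/listenbrainz',
#   filename.split('/')[-2]        -> '2021'   (year directory)
#   filename.split('/')[-1][0:-5]  -> '08'     ('.json' suffix stripped)
#   dest_path                      -> '/data/listenbrainz/2021/08.parquet'
# while a filename ending in 'invalid.json' maps straight to
# '/data/listenbrainz/invalid.parquet'.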
def tearDown(self):
    """ Delete the parquet file stored to ensure that the tests are independent. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    return super().tearDown()
def test_get_latest_full_dump_file_missing(self):
    """ Test to ensure 'None' is returned if metadata file is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)
    self.assertIsNone(import_utils.get_latest_full_dump())
def test_get_mapped_artist_and_recording_mbids(self):
    to_date = get_latest_listen_ts()
    partial_listen_df = dataframe_utils.get_listens_for_training_model_window(to_date, to_date, self.listens_path)
    df = utils.read_files_from_HDFS(self.mapping_path)
    mapping_df = mapping_utils.get_unique_rows_from_mapping(df)
    mapped_listens_path = '/mapped_listens.parquet'

    mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(partial_listen_df, mapping_df,
                                                                           mapped_listens_path)
    self.assertEqual(mapped_listens.count(), 8)

    cols = [
        'listened_at',
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
        'user_name'
    ]
    self.assertListEqual(sorted(cols), sorted(mapped_listens.columns))

    status = utils.path_exists(mapped_listens_path)
    self.assertTrue(status)
def upload_release_json_dump(self, archive: str):
    """ Decompress archive and upload MusicBrainz release dump to HDFS.

        Args:
            archive: release dump tar file to upload.
    """
    hdfs_dir = path.MUSICBRAINZ_RELEASE_DUMP
    # release.tar.xz file has the actual dump file inside an mbdump dir
    hdfs_mbdump_dir = os.path.join(hdfs_dir, "mbdump")

    with tarfile.open(name=archive, mode="r:xz") as tar, tempfile.TemporaryDirectory() as local_dir:
        # Remove existing dumps
        if utils.path_exists(hdfs_dir):
            utils.delete_dir(hdfs_dir, recursive=True)
        utils.create_dir(hdfs_dir)

        for member in tar:
            t0 = time.monotonic()
            logger.info(f"Extracting {member.name}")
            tar.extract(member, path=local_dir)
            logger.info(f"Done. Total time: {time.monotonic() - t0:.2f} sec")

            t0 = time.monotonic()
            logger.info(f"Uploading {member.name}")
            hdfs_path = os.path.join(hdfs_dir, member.name)
            local_path = os.path.join(local_dir, member.name)
            utils.upload_to_HDFS(hdfs_path, local_path)
            logger.info(f"Done. Total time: {time.monotonic() - t0:.2f} sec")
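# Hedged sketch of the archive layout this method assumes, per the comment
# above (the exact dump file name is an assumption for illustration):
#
#   release.tar.xz
#   └── mbdump/
#       └── release        <- the actual MusicBrainz release dump file
#
# Each member name therefore carries the mbdump/ prefix, so files extract to
# local_dir/mbdump/... and are mirrored to hdfs_dir/mbdump/... — the location
# hdfs_mbdump_dir points at.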
def test_save_dataframe(self):
    path_ = '/test_df.parquet'
    df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
    create_dataframes.save_dataframe(df, path_)

    status = utils.path_exists(path_)
    self.assertTrue(status)
def test_save_model(self):
    training_data, validation_data, test_data = super().split_playcounts()
    best_model, _, best_model_metadata = train_models.train(training_data, validation_data,
                                                            validation_data.count(), self.ranks,
                                                            self.lambdas, self.iterations)
    model_save_path = os.path.join('/test/model', best_model_metadata['model_id'])
    train_models.save_model(model_save_path, best_model_metadata['model_id'], best_model)

    model_exist = utils.path_exists(model_save_path)
    self.assertTrue(model_exist)
def test_upload_to_HDFS(self):
    temp_dir = tempfile.mkdtemp()
    local_path = os.path.join(temp_dir, 'test_file.txt')
    with open(local_path, 'w') as f:
        f.write('test file')

    self.path_ = '/test/upload.parquet'
    utils.upload_to_HDFS(self.path_, local_path)
    status = utils.path_exists(self.path_)
    self.assertTrue(status)
def test_insert_dump_data_file_missing(self):
    """ Test to ensure a file is created if it is missing. """
    path_found = path_exists(self.path_)
    if path_found:
        delete_dir(self.path_, recursive=True)

    self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
    import_utils.insert_dump_data(1, "full", datetime.fromtimestamp(1))
    self.assertTrue(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
def test_get_recordings_dataframe(self):
    metadata = {}
    mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
    recordings_df = create_dataframes.get_recordings_df(mapped_listens, metadata)

    self.assertEqual(recordings_df.count(), 3)
    self.assertListEqual(sorted(self.get_recordings_df().columns), sorted(recordings_df.columns))
    self.assertEqual(metadata['recordings_count'], 3)

    status = utils.path_exists(path.RECORDINGS_DATAFRAME_PATH)
    self.assertTrue(status)
def test_save_dataframe_metadata_to_HDFS(self):
    df_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
    metadata = self.get_dataframe_metadata(df_id)
    create_dataframes.save_dataframe_metadata_to_hdfs(metadata, RECOMMENDATION_RECORDING_DATAFRAME_METADATA)

    status = utils.path_exists(RECOMMENDATION_RECORDING_DATAFRAME_METADATA)
    self.assertTrue(status)

    df = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_DATAFRAME_METADATA)
    self.assertCountEqual(df.columns, schema.dataframe_metadata_schema.fieldNames())
def test_get_users_dataframe(self):
    metadata = {}
    mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
    users_df = create_dataframes.get_users_dataframe(mapped_listens, metadata)

    self.assertEqual(users_df.count(), 2)
    self.assertListEqual(sorted(self.get_users_df().columns), sorted(users_df.columns))
    self.assertEqual(metadata['users_count'], users_df.count())

    status = utils.path_exists(path.USERS_DATAFRAME_PATH)
    self.assertTrue(status)
def test_get_users_dataframe(self):
    metadata = {}
    mapped_df = utils.read_files_from_HDFS(MAPPED_LISTENS_PATH)
    users_df = create_dataframes.get_users_dataframe(mapped_df, metadata)

    self.assertEqual(users_df.count(), 1)
    self.assertListEqual(['user_name', 'user_id'], users_df.columns)
    self.assertEqual(metadata['users_count'], users_df.count())

    status = utils.path_exists(path.USERS_DATAFRAME_PATH)
    self.assertTrue(status)
def test_save_dataframe_metadata_to_HDFS(self):
    df_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
    metadata = self.get_dataframe_metadata(df_id)
    create_dataframes.save_dataframe_metadata_to_hdfs(metadata)

    status = utils.path_exists(path.DATAFRAME_METADATA)
    self.assertTrue(status)

    df = utils.read_files_from_HDFS(path.DATAFRAME_METADATA)
    # assertTrue(x, y) treats y as a failure message; compare the columns instead
    self.assertListEqual(sorted(df.columns), sorted(schema.dataframe_metadata_schema.fieldNames()))
def test_get_recordings_dataframe(self):
    metadata = {}
    mapped_listens = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_MAPPED_LISTENS)
    recordings_df = create_dataframes.get_recordings_df(mapped_listens, metadata,
                                                        RECOMMENDATION_RECORDINGS_DATAFRAME)

    self.assertEqual(recordings_df.count(), 20)
    self.assertCountEqual(['artist_credit_id', 'recording_id', 'recording_mbid'], recordings_df.columns)
    self.assertEqual(metadata['recordings_count'], 20)

    status = utils.path_exists(RECOMMENDATION_RECORDINGS_DATAFRAME)
    self.assertTrue(status)
def test_save_model_metadata_to_hdfs(self):
    model_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
    metadata = self.get_model_metadata(model_id)
    train_models.save_model_metadata_to_hdfs(metadata)

    status = utils.path_exists(path.MODEL_METADATA)
    self.assertTrue(status)

    df = utils.read_files_from_HDFS(path.MODEL_METADATA)
    # assertTrue(x, y) treats y as a failure message; compare the columns instead
    self.assertListEqual(sorted(df.columns), sorted(schema.model_metadata_schema.fieldNames()))
def test_upload_archive_failed(self):
    faulty_tar = MagicMock()
    faulty_tar.extract.side_effect = tarfile.ReadError()
    member = MagicMock()
    faulty_tar.__iter__.return_value = [member]

    tmp_dump_dir = tempfile.mkdtemp()
    self.assertRaises(DumpInvalidException, self.uploader.upload_archive, tmp_dump_dir,
                      faulty_tar, '/test', schema.artist_relation_schema, self.uploader.process_json)

    status = utils.path_exists('/test')
    self.assertFalse(status)
def test_get_mapped_artist_and_recording_mbids(self):
    partial_listen_df = create_dataframes.get_listens_for_training_model_window(
        self.date, self.date, {}, self.listens_path)
    mapping_df = utils.read_files_from_HDFS(self.mapping_path)

    mapped_listens = create_dataframes.get_mapped_artist_and_recording_mbids(
        partial_listen_df, mapping_df)
    self.assertEqual(mapped_listens.count(), 1)
    self.assertListEqual(sorted(self.get_mapped_listens().columns), sorted(mapped_listens.columns))

    status = utils.path_exists(path.MAPPED_LISTENS)
    self.assertTrue(status)
def test_get_recordings_dataframe(self):
    metadata = {}
    mapped_df = utils.read_files_from_HDFS(MAPPED_LISTENS_PATH)
    recordings_df = create_dataframes.get_recordings_df(mapped_df, metadata)

    self.assertEqual(recordings_df.count(), 1)
    self.assertListEqual(['mb_recording_mbid', 'mb_artist_credit_id', 'recording_id'],
                         recordings_df.columns)
    self.assertEqual(metadata['recordings_count'], 1)

    status = utils.path_exists(path.RECORDINGS_DATAFRAME_PATH)
    self.assertTrue(status)
def test_save_dataframe_metadata_to_HDFS(self):
    metadata = {
        'from_date': self.date,
        'to_date': self.date,
        'listens_count': 1,
        'model_id': '1',
        'playcounts_count': 1,
        'recordings_count': 1,
        'updated': True,
        'users_count': 1
    }
    create_dataframes.save_dataframe_metadata_to_HDFS(metadata)

    status = utils.path_exists(path.MODEL_METADATA)
    self.assertTrue(status)
def test_upload_archive(self):
    archive_path = self.create_test_tar()
    pxz = self.uploader.get_pxz_output(archive_path)
    tmp_dump_dir = tempfile.mkdtemp()
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        self.uploader.upload_archive(tmp_dump_dir, tar, '/artist_relations.parquet',
                                     schema.artist_relation_schema, self.uploader.process_json)

    df = utils.read_files_from_HDFS('/artist_relations.parquet')
    self.assertEqual(df.count(), 1)

    status = utils.path_exists(tmp_dump_dir)
    self.assertFalse(status)

    utils.delete_dir('/artist_relations.parquet', recursive=True)
def test_get_playcounts_df(self):
    metadata = {}
    mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
    users_df = create_dataframes.get_users_dataframe(mapped_listens, {})
    recordings_df = create_dataframes.get_recordings_df(mapped_listens, {})
    listens_df = create_dataframes.get_listens_df(mapped_listens, {})

    playcounts_df = create_dataframes.get_playcounts_df(listens_df, recordings_df,
                                                        users_df, metadata)
    self.assertEqual(playcounts_df.count(), 1)
    self.assertListEqual(['user_id', 'recording_id', 'count'], playcounts_df.columns)
    self.assertEqual(metadata['playcounts_count'], playcounts_df.count())

    status = utils.path_exists(path.PLAYCOUNTS_DATAFRAME_PATH)
    self.assertTrue(status)