Example #1
    def upload_new_listens_full_dump(self, archive: str):
        """ Upload new format parquet listens dumps to of a full
        dump to HDFS.

            Args:
                  archive: path to parquet listens dump to be uploaded
        """
        src_path = self.upload_archive_to_temp(archive)
        dest_path = path.LISTENBRAINZ_NEW_DATA_DIRECTORY
        # Delete existing dumps if any
        if utils.path_exists(dest_path):
            logger.info(f'Removing {dest_path} from HDFS...')
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info(f"Moving the processed files from {src_path} to {dest_path}")
        t0 = time.monotonic()

        # Check if parent directory exists, if not create a directory
        dest_path_parent = str(Path(dest_path).parent)
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(src_path, dest_path)
        logger.info(f"Done! Time taken: {time.monotonic() - t0:.2f} sec")
Example #2
    def test_path_exists(self):
        path_ = '/tests/test'
        utils.create_dir(path_)

        status = utils.path_exists(path_)
        self.assertTrue(status)
        utils.delete_dir(path_)
        status = utils.path_exists(path_)
        self.assertFalse(status)
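
The utils helpers exercised throughout these examples are not shown on this page. As a rough sketch only (an assumption, not the actual ListenBrainz implementation: it presumes the utils module wraps an HDFS client such as the HdfsCLI package's InsecureClient, and the namenode URL and user below are made up), they could look like this:

# Hypothetical sketch of the HDFS helpers used in these examples.
from hdfs import InsecureClient

client = InsecureClient('http://hdfs-namenode:9870', user='spark')

def path_exists(path: str) -> bool:
    # status() returns None instead of raising when strict=False
    return client.status(path, strict=False) is not None

def create_dir(path: str):
    client.makedirs(path)

def delete_dir(path: str, recursive: bool = False):
    client.delete(path, recursive=recursive)

def rename(src_path: str, dest_path: str):
    client.rename(src_path, dest_path)

def upload_to_HDFS(hdfs_path: str, local_path: str):
    client.upload(hdfs_path, local_path)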
Example #3
    def test_save_candidate_sets(self):
        top_artist_candidate_set_df = self.get_candidate_set()
        similar_artist_candidate_set_df = self.get_candidate_set()

        candidate_sets.save_candidate_sets(top_artist_candidate_set_df, similar_artist_candidate_set_df)
        top_artist_exist = utils.path_exists(path.TOP_ARTIST_CANDIDATE_SET)
        self.assertTrue(top_artist_exist)

        similar_artist_exist = utils.path_exists(path.SIMILAR_ARTIST_CANDIDATE_SET)
        self.assertTrue(similar_artist_exist)
Example #4
    def test_rename(self):
        utils.create_dir(self.path_)
        test_exists = utils.path_exists(self.path_)
        self.assertTrue(test_exists)
        utils.rename(self.path_, self.temp_path_)
        test_exists = utils.path_exists(self.path_)
        self.assertFalse(test_exists)
        temp_exists = utils.path_exists(self.temp_path_)
        self.assertTrue(temp_exists)
        utils.delete_dir(self.temp_path_)
Example #5
def delete_model():
    """ Delete the model.
        Note: at any point in time, only one model is kept in HDFS.
    """
    dir_exists = utils.path_exists(path.DATA_DIR)
    if dir_exists:
        utils.delete_dir(path.DATA_DIR, recursive=True)
Example #6
    def upload_archive_to_temp(self, archive: str) -> str:
        """ Upload parquet files in archive to a temporary hdfs directory

            Args:
                archive: the archive to be uploaded
            Returns:
                path of the temp dir where archive has been uploaded
            Notes:
                The following dump structure should be ensured for this
                function to work correctly. Say, the dump is named
                v-2021-08-15.tar. The tar should contain one top level
                directory, v-2021-08-15. This directory should contain
                all the files that need to be uploaded.
        """
        with tempfile.TemporaryDirectory() as local_temp_dir:
            logger.info("Cleaning HDFS temporary directory...")
            if utils.path_exists(HDFS_TEMP_DIR):
                utils.delete_dir(HDFS_TEMP_DIR, recursive=True)

            logger.info("Uploading listens to temporary directory in HDFS...")
            self.extract_and_upload_archive(archive, local_temp_dir, HDFS_TEMP_DIR)

        # dump is uploaded to HDFS_TEMP_DIR/archive_name
        archive_name = Path(archive).stem
        return str(Path(HDFS_TEMP_DIR).joinpath(archive_name))
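
The Notes above pin down the dump layout: the tar must contain a single top-level directory named after the archive, with all files inside it. A minimal sketch of building such an archive for testing (the helper name and file names are assumptions, not part of the ListenBrainz code):

# Hypothetical helper that packs parquet files into the layout
# upload_archive_to_temp() expects, e.g. v-2021-08-15.tar containing
# a single v-2021-08-15/ directory.
import os
import tarfile
import tempfile

def create_listens_dump(parquet_files, dump_name="v-2021-08-15"):
    archive_path = os.path.join(tempfile.mkdtemp(), f"{dump_name}.tar")
    with tarfile.open(archive_path, mode="w") as tar:
        for file in parquet_files:
            # place every file under the single top-level directory
            tar.add(file, arcname=os.path.join(dump_name, os.path.basename(file)))
    return archive_path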
Example #7
    def test_delete_model(self):
        df = utils.create_dataframe(Row(col1=1, col2=1), None)
        utils.save_parquet(df, path.RECOMMENDATION_RECORDING_DATA_DIR)
        train_models.delete_model()

        dir_exists = utils.path_exists(path.RECOMMENDATION_RECORDING_DATA_DIR)
        self.assertFalse(dir_exists)
Example #8
    def test_search_dump_file_missing(self):
        """ Test to ensure 'False' is returned if metadata file is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
Example #9
    def process_json_listens(self, filename, data_dir, tmp_hdfs_path, append,
                             schema):
        """ Process a file containing listens from the ListenBrainz dump and add listens to
            appropriate dataframes.

            Args:
                filename (str): File name of JSON file.
                data_dir (str): Dir to save listens to in HDFS as parquet.
                tmp_hdfs_path (str): HDFS path where listens JSON has been uploaded.
                append (bool): If true, append to the existing parquet file rather than overwrite it.
                schema: Schema of the listens
        """
        start_time = time.monotonic()
        df = utils.read_json(tmp_hdfs_path, schema=schema)

        if filename.split('/')[-1] == 'invalid.json':
            dest_path = os.path.join(data_dir, 'invalid.parquet')
        else:
            year = filename.split('/')[-2]
            month = filename.split('/')[-1][0:-5]
            dest_path = os.path.join(data_dir, year,
                                     '{}.parquet'.format(str(month)))

        logger.info("Uploading to {}...".format(dest_path))
        if append and utils.path_exists(dest_path):
            utils.save_parquet(df, dest_path, mode="append")
        else:
            utils.save_parquet(df, dest_path, mode="overwrite")

        logger.info(
            "File processed in {:.2f} seconds!".format(time.monotonic() -
                                                       start_time))
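
For reference, the destination path above is derived purely from the dump file name. A quick worked illustration (the example paths are made up):

# Illustrative only: how process_json_listens() maps a listens JSON file
# to its parquet destination.
import os

filename = '/listens/2021/5.json'
data_dir = '/data/listenbrainz'

year = filename.split('/')[-2]           # '2021'
month = filename.split('/')[-1][0:-5]    # '5' (drops the '.json' suffix)
dest_path = os.path.join(data_dir, year, '{}.parquet'.format(month))
# dest_path == '/data/listenbrainz/2021/5.parquet'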
Example #10
    def tearDown(self):
        """ Delete the stored parquet file to ensure that the tests are independent. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        return super().tearDown()
Example #11
    def test_get_latest_full_dump_file_missing(self):
        """ Test to ensure 'None' is returned if metadata file is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertIsNone(import_utils.get_latest_full_dump())
Example #12
    def test_get_mapped_artist_and_recording_mbids(self):
        to_date = get_latest_listen_ts()
        partial_listen_df = dataframe_utils.get_listens_for_training_model_window(to_date, to_date, self.listens_path)

        df = utils.read_files_from_HDFS(self.mapping_path)
        mapping_df = mapping_utils.get_unique_rows_from_mapping(df)
        mapped_listens_path = '/mapped_listens.parquet'

        mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(partial_listen_df, mapping_df, mapped_listens_path)
        self.assertEqual(mapped_listens.count(), 8)

        cols = [
            'listened_at',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'mb_release_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'user_name'
        ]

        self.assertListEqual(sorted(cols), sorted(mapped_listens.columns))
        status = utils.path_exists(mapped_listens_path)
        self.assertTrue(status)
Example #13
    def upload_release_json_dump(self, archive: str):
        """ Decompress archive and upload artist relation to HDFS.

            Args:
                archive: artist relation tar file to upload.
        """
        hdfs_dir = path.MUSICBRAINZ_RELEASE_DUMP
        hdfs_mbdump_dir = os.path.join(
            hdfs_dir, "mbdump"
        )  # release.tar.xz file has actual dump file inside mbdump dir
        with tarfile.open(name=archive, mode="r:xz"
                          ) as tar, tempfile.TemporaryDirectory() as local_dir:
            # Remove existing dumps
            if utils.path_exists(hdfs_dir):
                utils.delete_dir(hdfs_dir, recursive=True)

            utils.create_dir(hdfs_dir)

            for member in tar:
                t0 = time.monotonic()
                logger.info(f"Extracting {member.name}")
                tar.extract(member, path=local_dir)
                logger.info(
                    f"Done. Total time: {time.monotonic() - t0:.2f} sec")

                t0 = time.monotonic()
                logger.info(f"Uploading {member.name}")
                hdfs_path = os.path.join(hdfs_dir, member.name)
                local_path = os.path.join(local_dir, member.name)
                utils.upload_to_HDFS(hdfs_path, local_path)
                logger.info(
                    f"Done. Total time: {time.monotonic() - t0:.2f} sec")
Example #14
    def test_save_dataframe(self):
        path_ = '/test_df.parquet'
        df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
        create_dataframes.save_dataframe(df, path_)

        status = utils.path_exists(path_)
        self.assertTrue(status)
Example #15
    def test_save_model(self):
        training_data, validation_data, test_data = super().split_playcounts()
        best_model, _, best_model_metadata = train_models.train(training_data, validation_data,
            validation_data.count(), self.ranks, self.lambdas, self.iterations)
        model_save_path = os.path.join('/test/model', best_model_metadata['model_id'])
        train_models.save_model(model_save_path, best_model_metadata['model_id'], best_model)
        model_exist = utils.path_exists(model_save_path)
        self.assertTrue(model_exist)
Example #16
    def test_upload_to_HDFS(self):
        temp_dir = tempfile.mkdtemp()
        local_path = os.path.join(temp_dir, 'test_file.txt')
        with open(local_path, 'w') as f:
            f.write('test file')
        self.path_ = '/test/upload.parquet'
        utils.upload_to_HDFS(self.path_, local_path)
        status = utils.path_exists(self.path_)
        self.assertTrue(status)
Example #17
    def test_insert_dump_data_file_missing(self):
        """ Test to ensure a file is created if it is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
        import_utils.insert_dump_data(1, "full", datetime.fromtimestamp(1))
        self.assertTrue(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
Example #18
    def test_get_recordings_dataframe(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        recordings_df = create_dataframes.get_recordings_df(mapped_listens, metadata)
        self.assertEqual(recordings_df.count(), 3)
        self.assertListEqual(sorted(self.get_recordings_df().columns), sorted(recordings_df.columns))
        self.assertEqual(metadata['recordings_count'], 3)

        status = utils.path_exists(path.RECORDINGS_DATAFRAME_PATH)
        self.assertTrue(status)
Example #19
    def test_save_dataframe_metadata_to_HDFS(self):
        df_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
        metadata = self.get_dataframe_metadata(df_id)
        create_dataframes.save_dataframe_metadata_to_hdfs(metadata, RECOMMENDATION_RECORDING_DATAFRAME_METADATA)

        status = utils.path_exists(RECOMMENDATION_RECORDING_DATAFRAME_METADATA)
        self.assertTrue(status)

        df = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_DATAFRAME_METADATA)
        self.assertCountEqual(df.columns, schema.dataframe_metadata_schema.fieldNames())
Example #20
    def test_get_users_dataframe(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        users_df = create_dataframes.get_users_dataframe(mapped_listens, metadata)
        self.assertEqual(users_df.count(), 2)
        self.assertListEqual(sorted(self.get_users_df().columns), sorted(users_df.columns))
        self.assertEqual(metadata['users_count'], users_df.count())

        status = utils.path_exists(path.USERS_DATAFRAME_PATH)
        self.assertTrue(status)
Example #21
    def test_get_users_dataframe(self):
        metadata = {}
        mapped_df = utils.read_files_from_HDFS(MAPPED_LISTENS_PATH)
        users_df = create_dataframes.get_users_dataframe(mapped_df, metadata)
        self.assertEqual(users_df.count(), 1)
        self.assertListEqual(['user_name', 'user_id'], users_df.columns)
        self.assertEqual(metadata['users_count'], users_df.count())

        status = utils.path_exists(path.USERS_DATAFRAME_PATH)
        self.assertTrue(status)
Example #22
    def test_save_dataframe_metadata_to_HDFS(self):
        df_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
        metadata = self.get_dataframe_metadata(df_id)
        create_dataframes.save_dataframe_metadata_to_hdfs(metadata)

        status = utils.path_exists(path.DATAFRAME_METADATA)
        self.assertTrue(status)

        df = utils.read_files_from_HDFS(path.DATAFRAME_METADATA)
        self.assertListEqual(sorted(df.columns), sorted(schema.dataframe_metadata_schema.fieldNames()))
Example #23
    def test_get_recordings_dataframe(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        recordings_df = create_dataframes.get_recordings_df(mapped_listens, metadata, RECOMMENDATION_RECORDINGS_DATAFRAME)
        self.assertEqual(recordings_df.count(), 20)
        self.assertCountEqual(['artist_credit_id', 'recording_id', 'recording_mbid'], recordings_df.columns)
        self.assertEqual(metadata['recordings_count'], 20)

        status = utils.path_exists(RECOMMENDATION_RECORDINGS_DATAFRAME)
        self.assertTrue(status)
Example #24
    def test_save_model_metadata_to_hdfs(self):
        model_id = "3acb406f-c716-45f8-a8bd-96ca3939c2e5"
        metadata = self.get_model_metadata(model_id)

        train_models.save_model_metadata_to_hdfs(metadata)

        status = utils.path_exists(path.MODEL_METADATA)
        self.assertTrue(status)

        df = utils.read_files_from_HDFS(path.MODEL_METADATA)
        self.assertListEqual(sorted(df.columns), sorted(schema.model_metadata_schema.fieldNames()))
Example #25
    def test_upload_archive_failed(self):
        faulty_tar = MagicMock()
        faulty_tar.extract.side_effect = tarfile.ReadError()
        member = MagicMock()
        faulty_tar.__iter__.return_value = [member]

        tmp_dump_dir = tempfile.mkdtemp()
        self.assertRaises(DumpInvalidException, self.uploader.upload_archive, tmp_dump_dir,
                          faulty_tar, '/test', schema.artist_relation_schema, self.uploader.process_json)

        status = utils.path_exists('/test')
        self.assertFalse(status)
Example #26
    def test_get_mapped_artist_and_recording_mbids(self):
        partial_listen_df = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, self.listens_path)
        mapping_df = utils.read_files_from_HDFS(self.mapping_path)

        mapped_listens = create_dataframes.get_mapped_artist_and_recording_mbids(
            partial_listen_df, mapping_df)
        self.assertEqual(mapped_listens.count(), 1)
        self.assertListEqual(sorted(self.get_mapped_listens().columns),
                             sorted(mapped_listens.columns))
        status = utils.path_exists(path.MAPPED_LISTENS)
        self.assertTrue(status)
Example #27
    def test_get_recordings_dataframe(self):
        metadata = {}
        mapped_df = utils.read_files_from_HDFS(MAPPED_LISTENS_PATH)
        recordings_df = create_dataframes.get_recordings_df(
            mapped_df, metadata)
        self.assertEqual(recordings_df.count(), 1)
        self.assertListEqual(
            ['mb_recording_mbid', 'mb_artist_credit_id', 'recording_id'],
            recordings_df.columns)
        self.assertEqual(metadata['recordings_count'], 1)

        status = utils.path_exists(path.RECORDINGS_DATAFRAME_PATH)
        self.assertTrue(status)
Example #28
    def test_save_dataframe_metadata_to_HDFS(self):
        metadata = {
            'from_date': self.date,
            'to_date': self.date,
            'listens_count': 1,
            'model_id': '1',
            'playcounts_count': 1,
            'recordings_count': 1,
            'updated': True,
            'users_count': 1
        }
        create_dataframes.save_dataframe_metadata_to_HDFS(metadata)
        status = utils.path_exists(path.MODEL_METADATA)
        self.assertTrue(status)
Example #29
    def test_upload_archive(self):
        archive_path = self.create_test_tar()
        pxz = self.uploader.get_pxz_output(archive_path)
        tmp_dump_dir = tempfile.mkdtemp()

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            self.uploader.upload_archive(tmp_dump_dir, tar, '/artist_relations.parquet',
                                         schema.artist_relation_schema, self.uploader.process_json)

        df = utils.read_files_from_HDFS('/artist_relations.parquet')
        self.assertEqual(df.count(), 1)

        status = utils.path_exists(tmp_dump_dir)
        self.assertFalse(status)

        utils.delete_dir('/artist_relations.parquet', recursive=True)
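
The get_pxz_output helper used above is not shown on this page. A plausible sketch (an assumption only: it presumes the helper shells out to the external pxz binary and streams the decompressed archive, which would be why the test opens the tar with mode='r|'):

# Hypothetical sketch of get_pxz_output(): stream-decompress a .tar.xz
# archive with the external pxz binary so tarfile can read it as a pipe.
# Assumes pxz is installed and on PATH; the real helper may differ.
import subprocess

def get_pxz_output(archive_path):
    return subprocess.Popen(['pxz', '--decompress', '--stdout', archive_path],
                            stdout=subprocess.PIPE)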
Example #30
    def test_get_playcounts_df(self):
        metadata = {}
        mapped_listens = utils.read_files_from_HDFS(self.mapped_listens_path)
        users_df = create_dataframes.get_users_dataframe(mapped_listens, {})
        recordings_df = create_dataframes.get_recordings_df(mapped_listens, {})
        listens_df = create_dataframes.get_listens_df(mapped_listens, {})

        playcounts_df = create_dataframes.get_playcounts_df(
            listens_df, recordings_df, users_df, metadata)
        self.assertEqual(playcounts_df.count(), 1)
        self.assertListEqual(['user_id', 'recording_id', 'count'],
                             playcounts_df.columns)
        self.assertEqual(metadata['playcounts_count'], playcounts_df.count())

        status = utils.path_exists(path.PLAYCOUNTS_DATAFRAME_PATH)
        self.assertTrue(status)