Code example #1
    def upload_release_json_dump(self, archive: str):
        """ Decompress archive and upload artist relation to HDFS.

            Args:
                archive: artist relation tar file to upload.
        """
        hdfs_dir = path.MUSICBRAINZ_RELEASE_DUMP
        hdfs_mbdump_dir = os.path.join(
            hdfs_dir, "mbdump"
        )  # release.tar.xz file has actual dump file inside mbdump dir
        with tarfile.open(name=archive, mode="r:xz"
                          ) as tar, tempfile.TemporaryDirectory() as local_dir:
            # Remove existing dumps
            if utils.path_exists(hdfs_dir):
                utils.delete_dir(hdfs_dir, recursive=True)

            utils.create_dir(hdfs_dir)

            for member in tar:
                t0 = time.monotonic()
                logger.info(f"Extracting {member.name}")
                tar.extract(member, path=local_dir)
                logger.info(
                    f"Done. Total time: {time.monotonic() - t0:.2f} sec")

                t0 = time.monotonic()
                logger.info(f"Uploading {member.name}")
                hdfs_path = os.path.join(hdfs_dir, member.name)
                local_path = os.path.join(local_dir, member.name)
                utils.upload_to_HDFS(hdfs_path, local_path)
                logger.info(
                    f"Done. Total time: {time.monotonic() - t0:.2f} sec")
Code example #2
 def setUpClass(cls) -> None:
     super(RecommendationsTestCase, cls).setUpClass()
     utils.upload_to_HDFS(
         LISTENBRAINZ_NEW_DATA_DIRECTORY,
         os.path.join(TEST_DATA_PATH, 'rec_listens.parquet'))
     utils.upload_to_HDFS(
         RECOMMENDATION_RECORDING_MAPPED_LISTENS,
         os.path.join(TEST_DATA_PATH, 'mapped_listens.parquet'))
Code example #3
 def test_upload_to_HDFS(self):
     temp_file = tempfile.mkdtemp()
     local_path = os.path.join(temp_file, 'test_file.txt')
     with open(local_path, 'w') as f:
         f.write('test file')
     self.path_ = '/test/upload.parquet'
     utils.upload_to_HDFS(self.path_, local_path)
     status = utils.path_exists(self.path_)
     self.assertTrue(status)
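
Every example on this page calls utils.upload_to_HDFS(hdfs_path, local_path) with the HDFS destination first and the local source second, and the test above checks that the destination exists afterwards. A minimal sketch of such a helper, assuming the HdfsCLI package (hdfs.InsecureClient) rather than whatever client listenbrainz-server actually wraps:

    from hdfs import InsecureClient

    # Assumed connection details; a real deployment would reuse a shared client.
    client = InsecureClient("http://namenode:9870", user="hdfs")

    def upload_to_HDFS(hdfs_path, local_path):
        # Copy a local file (or directory) to the given HDFS path.
        client.upload(hdfs_path, local_path)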
Code example #4
File: __init__.py  Project: phw/listenbrainz-server
    def extract_and_upload_archive(self,
                                   archive,
                                   local_dir,
                                   hdfs_dir,
                                   cleanup_on_failure=True):
        """
        Extract the archive and upload it to the given hdfs directory.
        Args:
            archive: path to the tar archive to be uploaded
            local_dir: path to local dir to be used for extraction
            hdfs_dir: path to hdfs dir where contents of tar should be uploaded
            cleanup_on_failure: whether to delete the local and HDFS directories
                if an error occurs during extraction
        """
        total_files = 0
        total_time = 0.0
        with tarfile.open(archive, mode='r') as tar:
            for member in tar:
                if member.isfile() and member.name.endswith(".parquet"):
                    logger.info(f"Uploading {member.name}...")
                    t0 = time.monotonic()

                    try:
                        tar.extract(member, path=local_dir)
                    except tarfile.TarError as err:
                        if cleanup_on_failure:
                            if utils.path_exists(hdfs_dir):
                                utils.delete_dir(hdfs_dir, recursive=True)
                            shutil.rmtree(local_dir, ignore_errors=True)
                        raise DumpInvalidException(
                            f"{type(err).__name__} while extracting {member.name}, aborting import"
                        )

                    hdfs_path = os.path.join(hdfs_dir, member.name)
                    local_path = os.path.join(local_dir, member.name)
                    utils.upload_to_HDFS(hdfs_path, local_path)

                    time_taken = time.monotonic() - t0
                    total_files += 1
                    total_time += time_taken
                    logger.info(
                        f"Done! Current file processed in {time_taken:.2f} sec"
                    )
        logger.info(
            f"Done! Total files processed {total_files}. Average time taken: {total_time / total_files:.2f}"
        )
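
A hypothetical invocation of example #4, extracting a dump of parquet files through a temporary local directory. The uploader object is the same assumption as in the sketch after example #1, and both paths are illustrative:

    import tempfile

    # Illustrative archive and HDFS paths; cleanup_on_failure removes both the
    # HDFS and local directories if extraction fails partway through.
    with tempfile.TemporaryDirectory() as local_dir:
        uploader.extract_and_upload_archive(
            "/data/dumps/mapped_listens.tar",
            local_dir,
            "/data/mapped_listens",
            cleanup_on_failure=True,
        )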
Code example #5
    def upload_archive(self, tmp_dump_dir, tar, dest_path, schema, callback=None, force=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                force: If True deletes dir at dest_path
        """
        if callback is None:
            raise NotImplementedError('Callback to process JSON missing. Aborting...')

        if force:
            current_app.logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            current_app.logger.info('Done!')

        file_count = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                current_app.logger.info('Loading {}...'.format(member.name))
                t = time.time()
                tar.extract(member)
                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, dest_path, tmp_hdfs_path, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                file_count += 1
                time_taken = time.time() - t
                current_app.logger.info("Done! Processed {} files. Current file done in {:.2f} sec".format(
                    file_count, time_taken))
                total_time += time_taken
                average_time = total_time / file_count
                current_app.logger.info("Total time: {:.2f}, average time: {:.2f}".format(total_time, average_time))
        utils.delete_dir(tmp_dump_dir, recursive=True)
        shutil.rmtree(tmp_dump_dir)
Code example #6
File: __init__.py  Project: phw/listenbrainz-server
    def upload_archive(self,
                       tmp_dump_dir,
                       tar,
                       dest_path,
                       schema,
                       callback=None,
                       overwrite=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                overwrite: If True deletes dir at dest_path
        """
        if callback is None:
            raise NotImplementedError(
                'Callback to process JSON missing. Aborting...')

        # Delete TEMP_DIR_PATH if it exists
        if utils.path_exists(TEMP_DIR_PATH):
            utils.delete_dir(TEMP_DIR_PATH, recursive=True)

        # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
        if not overwrite and utils.path_exists(dest_path):
            t0 = time.monotonic()
            logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
            utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
            logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                          t0))

        logger.info("Uploading listens to temporary directory in HDFS...")
        total_files = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                logger.info("Uploading {}...".format(member.name))
                t0 = time.monotonic()

                try:
                    tar.extract(member)
                except TarError as err:
                    # Cleanup
                    if utils.path_exists(TEMP_DIR_PATH):
                        utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                    if utils.path_exists(tmp_dump_dir):
                        utils.delete_dir(tmp_dump_dir, recursive=True)
                    raise DumpInvalidException(
                        "{} while extracting {}, aborting import".format(
                            type(err).__name__, member.name))

                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path,
                         not overwrite, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                time_taken = time.monotonic() - t0
                total_files += 1
                total_time += time_taken
                logger.info(
                    "Done! Current file processed in {:.2f} sec".format(
                        time_taken))
        logger.info(
            "Done! Total files processed {}. Average time taken: {:.2f}".
            format(total_files, total_time / total_files))

        # Delete dest_path if present
        if utils.path_exists(dest_path):
            logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info("Moving the processed files to {}".format(dest_path))
        t0 = time.monotonic()

        # Check if parent directory exists, if not create a directory
        dest_path_parent = pathlib.Path(dest_path).parent
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(TEMP_DIR_PATH, dest_path)
        logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

        # Cleanup
        utils.delete_dir(tmp_dump_dir, recursive=True)
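
Example #6 invokes callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path, not overwrite, schema), so the callback is expected to read each uploaded JSON file and write it into the directory it is given (TEMP_DIR_PATH here, which is later renamed to dest_path). A minimal sketch of such a callback using PySpark; the function name and the append/overwrite handling are assumptions, not the actual listenbrainz-server implementation:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    def process_json(filename, dest_path, tmp_hdfs_path, append, schema):
        # Read the freshly uploaded JSON file from HDFS with the expected schema.
        df = spark.read.schema(schema).json(tmp_hdfs_path)
        # Append to, or replace, the parquet data under dest_path.
        df.write.mode("append" if append else "overwrite").parquet(dest_path)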