Example #1
    def upload_new_listens_full_dump(self, archive: str):
        """ Upload new format parquet listens dumps to of a full
        dump to HDFS.

            Args:
                  archive: path to parquet listens dump to be uploaded
        """
        src_path = self.upload_archive_to_temp(archive)
        dest_path = path.LISTENBRAINZ_NEW_DATA_DIRECTORY
        # Delete existing dumps if any
        if utils.path_exists(dest_path):
            logger.info(f'Removing {dest_path} from HDFS...')
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info(f"Moving the processed files from {src_path} to {dest_path}")
        t0 = time.monotonic()

        # Check if parent directory exists, if not create a directory
        dest_path_parent = str(Path(dest_path).parent)
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(src_path, dest_path)
        utils.logger.info(f"Done! Time taken: {time.monotonic() - t0:.2f}")
    def test_rename(self):
        """ Test that rename moves a directory from the old path to the new one. """
        utils.create_dir(self.path_)
        test_exists = utils.path_exists(self.path_)
        self.assertTrue(test_exists)
        utils.rename(self.path_, self.temp_path_)
        test_exists = utils.path_exists(self.path_)
        self.assertFalse(test_exists)
        temp_exists = utils.path_exists(self.temp_path_)
        self.assertTrue(temp_exists)
        utils.delete_dir(self.temp_path_)
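
The rename test above assumes a unittest TestCase that provides self.path_ and self.temp_path_ as scratch HDFS paths. A hypothetical harness is sketched below; the class name, the module path of utils, and the concrete paths are assumptions.

import unittest

from listenbrainz_spark import utils


class HDFSUtilsTestCase(unittest.TestCase):

    def setUp(self):
        # Scratch HDFS directories used by test_rename; the test itself
        # removes the renamed directory at the end.
        self.path_ = '/test/rename_src'
        self.temp_path_ = '/test/rename_dest'

Example #3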
    def test_get_latest_full_dump_no_full(self):
        """ Test to ensure 'None' is returned if not full import has been made. """
        # Remove full dump entries from parquet
        import_meta_df = read_files_from_HDFS(self.path_)
        result = import_meta_df.filter(import_meta_df.dump_type != "full")

        # We have to save the dataframe as a different file and move it as the df itself is read from the file
        save_parquet(result, '/temp.parquet')
        delete_dir(self.path_, recursive=True)
        rename('/temp.parquet', self.path_)

        self.assertIsNone(import_utils.get_latest_full_dump())
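
The test relies on get_latest_full_dump returning None once all "full" rows have been filtered out of the metadata parquet. A rough sketch of the behaviour being tested, reusing the helper names from the other examples (the project's real implementation may differ):

def get_latest_full_dump():
    """ Return the most recent full dump import, or None if there is none. """
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        return None

    # Keep only full dump imports and pick the newest one.
    full_dumps = import_meta_df \
        .filter(import_meta_df.dump_type == "full") \
        .orderBy(import_meta_df.imported_at.desc())

    latest = full_dumps.take(1)
    return latest[0] if latest else None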
Example #4
def insert_dump_data(dump_id: int, dump_type: str, imported_at: datetime):
    """ Insert information about dump imported """
    import_meta_df = None
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        current_app.logger.info("Import metadata file not found, creating...")

    data = create_dataframe(Row(dump_id, dump_type, imported_at), schema=import_metadata_schema)
    if import_meta_df is not None:
        result = import_meta_df \
            .filter(f"dump_id != '{dump_id}' OR dump_type != '{dump_type}'") \
            .union(data)
    else:
        result = data

    # We have to save the dataframe as a different file and move it as the df itself is read from the file
    save_parquet(result, "/temp.parquet")
    if path_exists(IMPORT_METADATA):
        delete_dir(IMPORT_METADATA, recursive=True)
    rename("/temp.parquet", IMPORT_METADATA)
Example #5
    def upload_archive(self,
                       tmp_dump_dir,
                       tar,
                       dest_path,
                       schema,
                       callback=None,
                       overwrite=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                overwrite: If True, replace any existing data at dest_path instead of merging with it.
        """
        if callback is None:
            raise NotImplementedError(
                'Callback to process JSON missing. Aborting...')

        # Delete TEMP_DIR_PATH if it exists
        if utils.path_exists(TEMP_DIR_PATH):
            utils.delete_dir(TEMP_DIR_PATH, recursive=True)

        # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
        if not overwrite and utils.path_exists(dest_path):
            t0 = time.monotonic()
            logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
            utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
            logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                          t0))

        logger.info("Uploading listens to temporary directory in HDFS...")
        total_files = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                logger.info("Uploading {}...".format(member.name))
                t0 = time.monotonic()

                try:
                    tar.extract(member)
                except TarError as err:
                    # Cleanup
                    if utils.path_exists(TEMP_DIR_PATH):
                        utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                    if utils.path_exists(tmp_dump_dir):
                        utils.delete_dir(tmp_dump_dir, recursive=True)
                    raise DumpInvalidException(
                        "{} while extracting {}, aborting import".format(
                            type(err).__name__, member.name))

                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path,
                         not overwrite, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                time_taken = time.monotonic() - t0
                total_files += 1
                total_time += time_taken
                logger.info(
                    "Done! Current file processed in {:.2f} sec".format(
                        time_taken))
        # Guard against division by zero when the tar contains no JSON files
        logger.info(
            "Done! Total files processed {}. Average time taken: {:.2f}".format(
                total_files, total_time / total_files if total_files else 0.0))

        # Delete dest_path if present
        if utils.path_exists(dest_path):
            logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info("Moving the processed files to {}".format(dest_path))
        t0 = time.monotonic()

        # Check if parent directory exists, if not create a directory
        dest_path_parent = pathlib.Path(dest_path).parent
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(TEMP_DIR_PATH, dest_path)
        utils.logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                            t0))

        # Cleanup
        utils.delete_dir(tmp_dump_dir, recursive=True)
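
upload_archive delegates the JSON-to-parquet conversion to the callback, which is invoked as callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path, not overwrite, schema). A hedged sketch of one possible callback is shown below; the function name and the way the Spark session is obtained are assumptions, not the project's actual code.

from pyspark.sql import SparkSession


def process_json(filename, dest_path, tmp_hdfs_path, append, schema):
    """ Hypothetical callback: read one extracted JSON file from HDFS and
        write it out as parquet at dest_path.
    """
    spark = SparkSession.builder.getOrCreate()

    # Read the extracted JSON file using the expected schema.
    df = spark.read.schema(schema).json(tmp_hdfs_path)

    # Append when merging with previously copied listens, otherwise overwrite.
    mode = 'append' if append else 'overwrite'
    df.write.mode(mode).parquet(dest_path)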