def upload_new_listens_full_dump(self, archive: str):
    """ Upload a new format parquet listens dump of a full dump to HDFS.

        Args:
            archive: path to the parquet listens dump to be uploaded
    """
    src_path = self.upload_archive_to_temp(archive)
    dest_path = path.LISTENBRAINZ_NEW_DATA_DIRECTORY

    # Delete existing dumps if any
    if utils.path_exists(dest_path):
        logger.info(f'Removing {dest_path} from HDFS...')
        utils.delete_dir(dest_path, recursive=True)
        logger.info('Done!')

    logger.info(f"Moving the processed files from {src_path} to {dest_path}")
    t0 = time.monotonic()

    # Check if the parent directory exists, if not create it
    dest_path_parent = str(Path(dest_path).parent)
    if not utils.path_exists(dest_path_parent):
        utils.create_dir(dest_path_parent)

    utils.rename(src_path, dest_path)
    logger.info(f"Done! Time taken: {time.monotonic() - t0:.2f}")
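# Hypothetical invocation of the method above, assuming it is defined on the
# project's HDFS uploader class. The class name ListenbrainzDataUploader and
# the local archive path are assumptions used only for illustration.
uploader = ListenbrainzDataUploader()
uploader.upload_new_listens_full_dump("/tmp/listenbrainz-spark-dump-full.tar")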
def test_rename(self):
    utils.create_dir(self.path_)
    test_exists = utils.path_exists(self.path_)
    self.assertTrue(test_exists)

    utils.rename(self.path_, self.temp_path_)
    test_exists = utils.path_exists(self.path_)
    self.assertFalse(test_exists)

    temp_exists = utils.path_exists(self.temp_path_)
    self.assertTrue(temp_exists)
    utils.delete_dir(self.temp_path_)
def test_get_latest_full_dump_no_full(self):
    """ Test to ensure None is returned if no full import has been made. """
    # Remove full dump entries from the parquet file
    import_meta_df = read_files_from_HDFS(self.path_)
    result = import_meta_df.filter(import_meta_df.dump_type != "full")

    # We have to save the dataframe as a different file and move it, as the df itself is read from the file
    save_parquet(result, '/temp.parquet')
    delete_dir(self.path_, recursive=True)
    rename('/temp.parquet', self.path_)

    self.assertIsNone(import_utils.get_latest_full_dump())
def insert_dump_data(dump_id: int, dump_type: str, imported_at: datetime):
    """ Insert information about the imported dump """
    import_meta_df = None
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        current_app.logger.info("Import metadata file not found, creating...")

    data = create_dataframe(Row(dump_id, dump_type, imported_at), schema=import_metadata_schema)
    if import_meta_df is not None:
        result = import_meta_df \
            .filter(f"dump_id != '{dump_id}' OR dump_type != '{dump_type}'") \
            .union(data)
    else:
        result = data

    # We have to save the dataframe as a different file and move it, as the df itself is read from the file
    save_parquet(result, "/temp.parquet")

    if path_exists(IMPORT_METADATA):
        delete_dir(IMPORT_METADATA, recursive=True)

    rename("/temp.parquet", IMPORT_METADATA)
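# Illustrative call to insert_dump_data() after a dump import finishes. The
# dump id and timestamp below are made-up values for demonstration, and the
# call assumes an active Spark session plus the Flask app context that
# current_app relies on above.
from datetime import datetime, timezone

insert_dump_data(dump_id=42, dump_type="full", imported_at=datetime.now(timezone.utc))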
def upload_archive(self, tmp_dump_dir, tar, dest_path, schema, callback=None, overwrite=False):
    """ Upload data dump to HDFS.

        Args:
            tmp_dump_dir (str): Path to temporary directory to upload JSON.
            tar: Uncompressed tar object.
            dest_path (str): HDFS path to upload data dump.
            schema: Schema of parquet to be uploaded.
            callback: Function to process the extracted JSON files.
            overwrite: If True, deletes the existing directory at dest_path instead of merging with it.
    """
    if callback is None:
        raise NotImplementedError('Callback to process JSON missing. Aborting...')

    # Delete TEMP_DIR_PATH if it exists
    if utils.path_exists(TEMP_DIR_PATH):
        utils.delete_dir(TEMP_DIR_PATH, recursive=True)

    # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
    if not overwrite and utils.path_exists(dest_path):
        t0 = time.monotonic()
        logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
        utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
        logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

    logger.info("Uploading listens to temporary directory in HDFS...")
    total_files = 0
    total_time = 0.0
    for member in tar:
        if member.isfile() and self._is_json_file(member.name):
            logger.info("Uploading {}...".format(member.name))
            t0 = time.monotonic()

            try:
                tar.extract(member)
            except TarError as err:
                # Cleanup
                if utils.path_exists(TEMP_DIR_PATH):
                    utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                if utils.path_exists(tmp_dump_dir):
                    utils.delete_dir(tmp_dump_dir, recursive=True)
                raise DumpInvalidException("{} while extracting {}, aborting import".format(
                    type(err).__name__, member.name))

            tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
            utils.upload_to_HDFS(tmp_hdfs_path, member.name)
            callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path, not overwrite, schema)
            utils.delete_dir(tmp_hdfs_path, recursive=True)
            os.remove(member.name)

            time_taken = time.monotonic() - t0
            total_files += 1
            total_time += time_taken
            logger.info("Done! Current file processed in {:.2f} sec".format(time_taken))

    # Guard against division by zero when the tar contained no JSON files
    if total_files:
        logger.info("Done! Total files processed {}. Average time taken: {:.2f}".format(
            total_files, total_time / total_files))

    # Delete dest_path if present
    if utils.path_exists(dest_path):
        logger.info('Removing {} from HDFS...'.format(dest_path))
        utils.delete_dir(dest_path, recursive=True)
        logger.info('Done!')

    logger.info("Moving the processed files to {}".format(dest_path))
    t0 = time.monotonic()

    # Check if the parent directory exists, if not create it
    dest_path_parent = str(pathlib.Path(dest_path).parent)
    if not utils.path_exists(dest_path_parent):
        utils.create_dir(dest_path_parent)

    utils.rename(TEMP_DIR_PATH, dest_path)
    logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

    # Cleanup
    utils.delete_dir(tmp_dump_dir, recursive=True)
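# A minimal sketch of the kind of callback upload_archive() expects. The call
# site above invokes it as callback(filename, dest_path, tmp_hdfs_path, append,
# schema), so this example reads the JSON file that was just uploaded to HDFS
# and folds it into the parquet data being assembled under dest_path. It is
# illustrative only: the function name process_json is hypothetical, and the
# body uses plain PySpark (assuming a SparkSession has already been configured
# for the cluster) instead of the project's own read/write helpers.
from pyspark.sql import SparkSession


def process_json(filename, dest_path, tmp_hdfs_path, append, schema):
    logger.info("Processing {}...".format(filename))
    spark = SparkSession.builder.getOrCreate()
    # Read the uploaded JSON using the schema passed down from upload_archive()
    df = spark.read.json(tmp_hdfs_path, schema=schema)
    # Merge with whatever is already in dest_path (old listens copied earlier,
    # or files processed previously in this run); only the very first write of
    # a fresh import starts the directory over.
    if append or utils.path_exists(dest_path):
        df.write.mode("append").parquet(dest_path)
    else:
        df.write.mode("overwrite").parquet(dest_path)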