def export_cli(tag, dry_run, dbfs_path, delete, git_ssh_url, api_client: ApiClient, hcl, pattern_matches):
    if hcl:
        log.debug("hcl flag set; exporting dbfs files as HCL")
        service = DbfsService(api_client)

        files = get_dbfs_files_recursive(service, dbfs_path)
        log.info(files)

        with GitExportHandler(git_ssh_url, "dbfs", delete_not_found=delete, dry_run=dry_run, tag=tag) as gh:
            for file in files:
                assert "path" in file
                assert "is_dir" in file
                assert "file_size" in file
                if file["is_dir"]:
                    continue

                base_name = file["path"]
                identifier = normalize_identifier(f"databricks_dbfs_file-{base_name}")
                dbfs_resource_data = {
                    "@expr:source": f'pathexpand("{identifier}")',
                    "@expr:content_b64_md5": f'md5(filebase64(pathexpand("{identifier}")))',
                    "path": file["path"],
                    "overwrite": True,
                    "mkdirs": True,
                    "validate_remote_file": True,
                }

                name = "databricks_dbfs_file"
                dbfs_file_hcl = create_resource_from_dict(name, identifier, dbfs_resource_data, False)

                processed_hcl_file = create_hcl_file(file['path'], api_client.url, dbfs_resource_data, dbfs_file_hcl)

                gh.add_file(f"{identifier}.tf", processed_hcl_file)
                gh.add_file(f"files/{identifier}", get_file_contents(service, file["path"]))
                hcl_errors = validate_hcl(dbfs_file_hcl)
                if len(hcl_errors) > 0:
                    log.error(f"Identified error in the following HCL Config: {dbfs_file_hcl}")
                    log.error(hcl_errors)
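# For illustration, a resource emitted by the loop above would plausibly render
# as the HCL below. This is a sketch: the exact output depends on
# create_resource_from_dict, the identifier shown is a hypothetical result of
# normalize_identifier, and the "@expr:" prefix appears to mark values emitted
# as raw HCL expressions rather than quoted strings:
#
#   resource "databricks_dbfs_file" "databricks_dbfs_file__mnt_data_example_csv" {
#     source               = pathexpand("databricks_dbfs_file__mnt_data_example_csv")
#     content_b64_md5      = md5(filebase64(pathexpand("databricks_dbfs_file__mnt_data_example_csv")))
#     path                 = "/mnt/data/example.csv"
#     overwrite            = true
#     mkdirs               = true
#     validate_remote_file = true
#   }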
def _get_dbfs_file_data_recursive(service: DbfsService, path):
    resp = service.list(path)
    if "files" not in resp:
        return []
    files = resp["files"]
    output = []
    for file in files:
        if file["is_dir"]:
            output += _get_dbfs_file_data_recursive(service, file["path"])
        else:
            output.append(file)
    return output
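# Minimal usage sketch (hypothetical path; assumes an authenticated ApiClient
# as in the surrounding snippets). The helper flattens the DBFS tree into a
# single list of file dicts, descending into each directory it encounters:
def example_list_dbfs_tree(api_client):
    service = DbfsService(api_client)
    for f in _get_dbfs_file_data_recursive(service, "/mnt/data"):
        print(f["path"], f["file_size"])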
def get_file_contents(dbfs_service: DbfsService, dbfs_path: Text, headers=None):
    abs_path = f"dbfs:{dbfs_path}"
    json = dbfs_service.get_status(abs_path, headers=headers)
    file_info = FileInfo.from_json(json)
    if file_info.is_dir:
        error_and_quit('The dbfs file {} is a directory.'.format(repr(abs_path)))

    length = file_info.file_size
    offset = 0
    output = io.StringIO()
    while offset < length:
        response = dbfs_service.read(abs_path, offset, BUFFER_SIZE_BYTES, headers=headers)
        bytes_read = response['bytes_read']
        data = response['data']
        offset += bytes_read
        output.write(b64decode(data).decode("utf-8"))
    return output.getvalue()
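# Usage sketch (hypothetical path). The function reads the file in
# BUFFER_SIZE_BYTES chunks and base64-decodes each response, so no single API
# call has to return the whole file:
def example_read_dbfs_text(api_client):
    contents = get_file_contents(DbfsService(api_client), "/tmp/notes.txt")
    print(contents)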
def get_dbfs_client():
    api_client = _get_api_client()
    return DbfsService(api_client)
def __init__(self, api_client):
    self.client = DbfsService(api_client)
class DbfsApi(object):
    MULTIPART_UPLOAD_LIMIT = 2147483648

    def __init__(self, api_client):
        self.client = DbfsService(api_client)

    def list_files(self, dbfs_path, headers=None):
        list_response = self.client.list(dbfs_path.absolute_path, headers=headers)
        if 'files' in list_response:
            return [FileInfo.from_json(f) for f in list_response['files']]
        else:
            return []

    def file_exists(self, dbfs_path, headers=None):
        try:
            self.get_status(dbfs_path, headers=headers)
        except HTTPError as e:
            try:
                if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                    return False
            except ValueError:
                pass
            raise e
        return True

    def get_status(self, dbfs_path, headers=None):
        json = self.client.get_status(dbfs_path.absolute_path, headers=headers)
        return FileInfo.from_json(json)

    # Makes a multipart/form-data upload for files < 2GB; otherwise uses the
    # create, add-block, close methods for a streaming upload.
    def put_file(self, src_path, dbfs_path, overwrite, headers=None):
        # If the file size is > 2GB, use the streaming upload.
        if os.path.getsize(src_path) < self.MULTIPART_UPLOAD_LIMIT:
            self.client.put(dbfs_path.absolute_path, src_path=src_path, overwrite=overwrite,
                            headers=headers)
        else:
            handle = self.client.create(dbfs_path.absolute_path, overwrite,
                                        headers=headers)['handle']
            with open(src_path, 'rb') as local_file:
                while True:
                    contents = local_file.read(BUFFER_SIZE_BYTES)
                    if len(contents) == 0:
                        break
                    # add_block should not take a bytes object.
                    self.client.add_block(handle, b64encode(contents).decode(), headers=headers)
            self.client.close(handle, headers=headers)

    def get_file(self, dbfs_path, dst_path, overwrite, headers=None):
        if os.path.exists(dst_path) and not overwrite:
            raise LocalFileExistsException('{} exists already.'.format(dst_path))
        file_info = self.get_status(dbfs_path, headers=headers)
        if file_info.is_dir:
            error_and_quit('The dbfs file {} is a directory.'.format(repr(dbfs_path)))
        length = file_info.file_size
        offset = 0
        with open(dst_path, 'wb') as local_file:
            while offset < length:
                response = self.client.read(dbfs_path.absolute_path, offset, BUFFER_SIZE_BYTES,
                                            headers=headers)
                bytes_read = response['bytes_read']
                data = response['data']
                offset += bytes_read
                local_file.write(b64decode(data))

    @staticmethod
    def get_num_files_deleted(partial_delete_error):
        try:
            message = partial_delete_error.response.json()['message']
        except (AttributeError, KeyError):
            raise ParseException("Unable to retrieve the number of deleted files.")
        m = re.compile(r".*operation has deleted (\d+) files.*").match(message)
        if not m:
            raise ParseException(
                "Unable to retrieve the number of deleted files from the error message: {}".format(
                    message))
        return int(m.group(1))

    def delete(self, dbfs_path, recursive, headers=None):
        num_files_deleted = 0
        while True:
            try:
                self.client.delete(dbfs_path.absolute_path, recursive=recursive, headers=headers)
            except HTTPError as e:
                if e.response.status_code == 503:
                    try:
                        error_code = e.response.json()['error_code']
                    except (AttributeError, KeyError):
                        error_code = None
                    # Handle partial delete exceptions: retry until all the files
                    # have been deleted.
                    if error_code == DbfsErrorCodes.PARTIAL_DELETE:
                        try:
                            num_files_deleted += DbfsApi.get_num_files_deleted(e)
                            click.echo("\rDeleted {} files. Delete in progress...\033[K".format(
                                num_files_deleted), nl=False)
                        except ParseException:
                            click.echo("\rDelete in progress...\033[K", nl=False)
                        continue
                click.echo("\rDeleted at least {} files but interrupted by error.\033[K".format(
                    num_files_deleted))
                raise e
            break
        click.echo("\rDelete finished successfully.\033[K")

    def mkdirs(self, dbfs_path, headers=None):
        self.client.mkdirs(dbfs_path.absolute_path, headers=headers)

    def move(self, dbfs_src, dbfs_dst, headers=None):
        self.client.move(dbfs_src.absolute_path, dbfs_dst.absolute_path, headers=headers)

    def _copy_to_dbfs_non_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        # Munge dst path in case dbfs_path_dst is a dir.
        try:
            if self.get_status(dbfs_path_dst, headers=headers).is_dir:
                dbfs_path_dst = dbfs_path_dst.join(os.path.basename(src))
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                pass
            else:
                raise e
        self.put_file(src, dbfs_path_dst, overwrite, headers=headers)

    def _copy_from_dbfs_non_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        # Munge dst path in case dst is a dir.
        if os.path.isdir(dst):
            dst = os.path.join(dst, dbfs_path_src.basename)
        self.get_file(dbfs_path_src, dst, overwrite, headers=headers)

    def _copy_to_dbfs_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        try:
            self.mkdirs(dbfs_path_dst, headers=headers)
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                click.echo(e.response.json())
                return
        for filename in os.listdir(src):
            cur_src = os.path.join(src, filename)
            cur_dbfs_dst = dbfs_path_dst.join(filename)
            if os.path.isdir(cur_src):
                self._copy_to_dbfs_recursive(cur_src, cur_dbfs_dst, overwrite, headers=headers)
            elif os.path.isfile(cur_src):
                try:
                    self.put_file(cur_src, cur_dbfs_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_src, cur_dbfs_dst))
                except HTTPError as e:
                    if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                        click.echo('{} already exists. Skip.'.format(cur_dbfs_dst))
                    else:
                        raise e

    def _copy_from_dbfs_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        if os.path.isfile(dst):
            click.echo(
                '{} exists as a file. Skipping this subtree {}'.format(dst, repr(dbfs_path_src)))
            return
        elif not os.path.isdir(dst):
            os.makedirs(dst)
        for dbfs_src_file_info in self.list_files(dbfs_path_src, headers=headers):
            cur_dbfs_src = dbfs_src_file_info.dbfs_path
            cur_dst = os.path.join(dst, cur_dbfs_src.basename)
            if dbfs_src_file_info.is_dir:
                self._copy_from_dbfs_recursive(cur_dbfs_src, cur_dst, overwrite, headers=headers)
            else:
                try:
                    self.get_file(cur_dbfs_src, cur_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_dbfs_src, cur_dst))
                except LocalFileExistsException:
                    click.echo(('{} already exists locally as {}. Skip. To overwrite, you ' +
                                'should provide the --overwrite flag.').format(cur_dbfs_src,
                                                                               cur_dst))

    def cp(self, recursive, overwrite, src, dst, headers=None):
        if not DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            if not os.path.exists(src):
                error_and_quit('The local file {} does not exist.'.format(src))
            if not recursive:
                if os.path.isdir(src):
                    error_and_quit(
                        'The local file {} is a directory. You must provide --recursive'
                        .format(src))
                self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite, headers=headers)
            else:
                if not os.path.isdir(src):
                    self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite,
                                                     headers=headers)
                    return
                self._copy_to_dbfs_recursive(src, DbfsPath(dst), overwrite, headers=headers)
        # Copy from DBFS in this case.
        elif DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            if not recursive:
                self._copy_from_dbfs_non_recursive(DbfsPath(src), dst, overwrite, headers=headers)
            else:
                dbfs_path_src = DbfsPath(src)
                if not self.get_status(dbfs_path_src, headers=headers).is_dir:
                    self._copy_from_dbfs_non_recursive(dbfs_path_src, dst, overwrite,
                                                       headers=headers)
                self._copy_from_dbfs_recursive(dbfs_path_src, dst, overwrite, headers=headers)
        elif not DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            error_and_quit('Both paths provided are from your local filesystem. '
                           'To use this utility, one of the src or dst must be prefixed '
                           'with dbfs:/')
        elif DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            with TempDir() as temp_dir:
                # Always copy to <temp_dir>/temp since this will work no matter if it's a
                # recursive or a non-recursive copy.
                temp_path = temp_dir.path('temp')
                self.cp(recursive, True, src, temp_path)
                self.cp(recursive, overwrite, temp_path, dst)
        else:
            assert False, 'not reached'

    def cat(self, src):
        with TempDir() as temp_dir:
            temp_path = temp_dir.path('temp')
            self.cp(False, True, src, temp_path)
            with open(temp_path) as f:
                click.echo(f.read(), nl=False)
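# Round-trip usage sketch for the class above (hypothetical paths; assumes an
# already-constructed, authenticated ApiClient). Note that put_file picks the
# multipart upload or the streaming upload based on the local file size, and
# that downloading with overwrite=False onto an existing local file raises
# LocalFileExistsException:
def example_round_trip(api_client):
    dbfs = DbfsApi(api_client)
    # Upload a local file into DBFS.
    dbfs.cp(recursive=False, overwrite=True,
            src="/tmp/report.csv", dst="dbfs:/tmp/report.csv")
    # Download it back to a different local path.
    dbfs.cp(recursive=False, overwrite=True,
            src="dbfs:/tmp/report.csv", dst="/tmp/report_copy.csv")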
class DbfsApi(object):
    def __init__(self, api_client):
        self.client = DbfsService(api_client)

    def list_files(self, dbfs_path, headers=None):
        list_response = self.client.list(dbfs_path.absolute_path, headers=headers)
        if 'files' in list_response:
            return [FileInfo.from_json(f) for f in list_response['files']]
        else:
            return []

    def file_exists(self, dbfs_path, headers=None):
        try:
            self.get_status(dbfs_path, headers=headers)
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                return False
            raise e
        return True

    def get_status(self, dbfs_path, headers=None):
        json = self.client.get_status(dbfs_path.absolute_path, headers=headers)
        return FileInfo.from_json(json)

    def put_file(self, src_path, dbfs_path, overwrite, headers=None):
        handle = self.client.create(dbfs_path.absolute_path, overwrite,
                                    headers=headers)['handle']
        with open(src_path, 'rb') as local_file:
            while True:
                contents = local_file.read(BUFFER_SIZE_BYTES)
                if len(contents) == 0:
                    break
                # add_block should not take a bytes object.
                self.client.add_block(handle, b64encode(contents).decode(), headers=headers)
        self.client.close(handle, headers=headers)

    def get_file(self, dbfs_path, dst_path, overwrite, headers=None):
        if os.path.exists(dst_path) and not overwrite:
            raise LocalFileExistsException('{} exists already.'.format(dst_path))
        file_info = self.get_status(dbfs_path, headers=headers)
        if file_info.is_dir:
            error_and_quit('The dbfs file {} is a directory.'.format(repr(dbfs_path)))
        length = file_info.file_size
        offset = 0
        with open(dst_path, 'wb') as local_file:
            while offset < length:
                response = self.client.read(dbfs_path.absolute_path, offset, BUFFER_SIZE_BYTES,
                                            headers=headers)
                bytes_read = response['bytes_read']
                data = response['data']
                offset += bytes_read
                local_file.write(b64decode(data))

    def delete(self, dbfs_path, recursive, headers=None):
        self.client.delete(dbfs_path.absolute_path, recursive=recursive, headers=headers)

    def mkdirs(self, dbfs_path, headers=None):
        self.client.mkdirs(dbfs_path.absolute_path, headers=headers)

    def move(self, dbfs_src, dbfs_dst, headers=None):
        self.client.move(dbfs_src.absolute_path, dbfs_dst.absolute_path, headers=headers)

    def _copy_to_dbfs_non_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        # Munge dst path in case dbfs_path_dst is a dir.
        try:
            if self.get_status(dbfs_path_dst, headers=headers).is_dir:
                dbfs_path_dst = dbfs_path_dst.join(os.path.basename(src))
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                pass
            else:
                raise e
        self.put_file(src, dbfs_path_dst, overwrite, headers=headers)

    def _copy_from_dbfs_non_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        # Munge dst path in case dst is a dir.
        if os.path.isdir(dst):
            dst = os.path.join(dst, dbfs_path_src.basename)
        self.get_file(dbfs_path_src, dst, overwrite, headers=headers)

    def _copy_to_dbfs_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        try:
            self.mkdirs(dbfs_path_dst, headers=headers)
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                click.echo(e.response.json())
                return
        for filename in os.listdir(src):
            cur_src = os.path.join(src, filename)
            cur_dbfs_dst = dbfs_path_dst.join(filename)
            if os.path.isdir(cur_src):
                self._copy_to_dbfs_recursive(cur_src, cur_dbfs_dst, overwrite, headers=headers)
            elif os.path.isfile(cur_src):
                try:
                    self.put_file(cur_src, cur_dbfs_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_src, cur_dbfs_dst))
                except HTTPError as e:
                    if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                        click.echo('{} already exists. Skip.'.format(cur_dbfs_dst))
                    else:
                        raise e

    def _copy_from_dbfs_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        if os.path.isfile(dst):
            click.echo(
                '{} exists as a file. Skipping this subtree {}'.format(dst, repr(dbfs_path_src)))
            return
        elif not os.path.isdir(dst):
            os.makedirs(dst)
        for dbfs_src_file_info in self.list_files(dbfs_path_src, headers=headers):
            cur_dbfs_src = dbfs_src_file_info.dbfs_path
            cur_dst = os.path.join(dst, cur_dbfs_src.basename)
            if dbfs_src_file_info.is_dir:
                self._copy_from_dbfs_recursive(cur_dbfs_src, cur_dst, overwrite, headers=headers)
            else:
                try:
                    self.get_file(cur_dbfs_src, cur_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_dbfs_src, cur_dst))
                except LocalFileExistsException:
                    click.echo(('{} already exists locally as {}. Skip. To overwrite, you ' +
                                'should provide the --overwrite flag.').format(cur_dbfs_src,
                                                                               cur_dst))

    def cp(self, recursive, overwrite, src, dst, headers=None):
        if not DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            if not os.path.exists(src):
                error_and_quit('The local file {} does not exist.'.format(src))
            if not recursive:
                if os.path.isdir(src):
                    error_and_quit(
                        'The local file {} is a directory. You must provide --recursive'
                        .format(src))
                self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite, headers=headers)
            else:
                if not os.path.isdir(src):
                    self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite,
                                                     headers=headers)
                    return
                self._copy_to_dbfs_recursive(src, DbfsPath(dst), overwrite, headers=headers)
        # Copy from DBFS in this case.
        elif DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            if not recursive:
                self._copy_from_dbfs_non_recursive(DbfsPath(src), dst, overwrite, headers=headers)
            else:
                dbfs_path_src = DbfsPath(src)
                if not self.get_status(dbfs_path_src, headers=headers).is_dir:
                    self._copy_from_dbfs_non_recursive(dbfs_path_src, dst, overwrite,
                                                       headers=headers)
                self._copy_from_dbfs_recursive(dbfs_path_src, dst, overwrite, headers=headers)
        elif not DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            error_and_quit('Both paths provided are from your local filesystem. '
                           'To use this utility, one of the src or dst must be prefixed '
                           'with dbfs:/')
        elif DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            error_and_quit('Both paths provided are from the DBFS filesystem. '
                           'To copy between the DBFS filesystem, you currently must copy the '
                           'file from DBFS to your local filesystem and then back.')
        else:
            assert False, 'not reached'
class DbfsApi(object):
    def __init__(self, api_client):
        self.client = DbfsService(api_client)

    def list_files(self, dbfs_path):
        list_response = self.client.list(dbfs_path.absolute_path)
        if 'files' in list_response:
            return [FileInfo.from_json(f) for f in list_response['files']]
        else:
            return []

    def file_exists(self, dbfs_path):
        try:
            self.get_status(dbfs_path)
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                return False
            raise e
        return True

    def get_status(self, dbfs_path):
        json = self.client.get_status(dbfs_path.absolute_path)
        return FileInfo.from_json(json)

    def put_file(self, src_path, dbfs_path, overwrite):
        handle = self.client.create(dbfs_path.absolute_path, overwrite)['handle']
        with open(src_path, 'rb') as local_file:
            while True:
                contents = local_file.read(BUFFER_SIZE_BYTES)
                if len(contents) == 0:
                    break
                # add_block should not take a bytes object.
                self.client.add_block(handle, b64encode(contents).decode())
        self.client.close(handle)

    def get_file(self, dbfs_path, dst_path, overwrite):
        if os.path.exists(dst_path) and not overwrite:
            raise LocalFileExistsException('{} exists already.'.format(dst_path))
        file_info = self.get_status(dbfs_path)
        if file_info.is_dir:
            error_and_quit('The dbfs file {} is a directory.'.format(repr(dbfs_path)))
        length = file_info.file_size
        offset = 0
        with open(dst_path, 'wb') as local_file:
            while offset < length:
                response = self.client.read(dbfs_path.absolute_path, offset, BUFFER_SIZE_BYTES)
                bytes_read = response['bytes_read']
                data = response['data']
                offset += bytes_read
                local_file.write(b64decode(data))

    def delete(self, dbfs_path, recursive):
        self.client.delete(dbfs_path.absolute_path, recursive=recursive)

    def mkdirs(self, dbfs_path):
        self.client.mkdirs(dbfs_path.absolute_path)

    def move(self, dbfs_src, dbfs_dst):
        self.client.move(dbfs_src.absolute_path, dbfs_dst.absolute_path)
def __init__(self, api_client, delete_retry_delay_millis=DELETE_503_RETRY_DELAY_MILLIS):
    self.client = DbfsService(api_client)
    self.delete_retry_delay_millis = delete_retry_delay_millis
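# Usage sketch (assumptions: this constructor is a later revision of
# DbfsApi.__init__, DELETE_503_RETRY_DELAY_MILLIS is a module-level default in
# milliseconds, and the stored value is consumed between 503 retries in the
# delete loop shown earlier; shrinking it keeps tests fast):
def example_fast_retry_client(api_client):
    return DbfsApi(api_client, delete_retry_delay_millis=10)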