Code example #1
0
def export_cli(tag, dry_run, dbfs_path, delete, git_ssh_url,
               api_client: ApiClient, hcl, pattern_matches):
    """Export DBFS files under *dbfs_path* as Terraform (HCL) resources into git.

    Each regular file becomes a ``databricks_dbfs_file`` resource plus a copy
    of the file contents under ``files/`` in the target repository.

    Args:
        tag: optional tag applied to the export commit.
        dry_run: when True, GitExportHandler reports changes without pushing.
        dbfs_path: DBFS directory to export recursively.
        delete: when True, delete files from git that no longer exist in DBFS.
        git_ssh_url: SSH URL of the target git repository.
        api_client: authenticated Databricks ApiClient.
        hcl: when falsy, the export is skipped entirely.
        pattern_matches: unused here; kept for CLI interface compatibility.
    """
    if not hcl:
        return
    log.debug("this if debug")
    service = DbfsService(api_client)

    files = get_dbfs_files_recursive(service, dbfs_path)
    log.info(files)

    with GitExportHandler(git_ssh_url,
                          "dbfs",
                          delete_not_found=delete,
                          dry_run=dry_run,
                          tag=tag) as gh:
        for file in files:
            # Explicit validation instead of `assert`, which is stripped
            # when Python runs with -O.
            for required_key in ("path", "is_dir", "file_size"):
                if required_key not in file:
                    raise KeyError(
                        "DBFS listing entry is missing '{}': {}".format(
                            required_key, file))
            if file["is_dir"]:
                continue
            base_name = file["path"]

            identifier = normalize_identifier(
                f"databricks_dbfs_file-{base_name}")
            dbfs_resource_data = {
                "@expr:source": f'pathexpand("{identifier}")',
                "@expr:content_b64_md5":
                f'md5(filebase64(pathexpand("{identifier}")))',
                "path": file["path"],
                "overwrite": True,
                "mkdirs": True,
                "validate_remote_file": True,
            }

            name = "databricks_dbfs_file"

            dbfs_file_hcl = create_resource_from_dict(
                name, identifier, dbfs_resource_data, False)

            processed_hcl_file = create_hcl_file(file['path'],
                                                 api_client.url,
                                                 dbfs_resource_data,
                                                 dbfs_file_hcl)

            gh.add_file(f"{identifier}.tf", processed_hcl_file)
            gh.add_file(f"files/{identifier}",
                        get_file_contents(service, file["path"]))
            hcl_errors = validate_hcl(dbfs_file_hcl)
            if hcl_errors:
                # Lazy %-style args avoid formatting unless the level is enabled.
                log.error(
                    "Identified error in the following HCL Config: %s",
                    dbfs_file_hcl)
                log.error(hcl_errors)
Code example #2
0
def _get_dbfs_file_data_recrusive(service: DbfsService, path):
    """Recursively flatten the DBFS tree rooted at *path*.

    Directories are descended into; only non-directory entries are
    returned, in listing order.
    """
    entries = service.list(path).get("files", [])
    collected = []
    for entry in entries:
        if entry["is_dir"] is True:
            collected.extend(_get_dbfs_file_data_recrusive(service, entry["path"]))
        else:
            collected.append(entry)
    return collected
Code example #3
0
def get_file_contents(dbfs_service: DbfsService,
                      dbfs_path: Text,
                      headers=None):
    """Download a DBFS file and return its contents as a UTF-8 string.

    Args:
        dbfs_service: DbfsService used for the REST calls.
        dbfs_path: path within DBFS (without the ``dbfs:`` scheme prefix).
        headers: optional extra HTTP headers passed through to the service.

    Returns:
        The decoded file contents.

    Exits via ``error_and_quit`` when *dbfs_path* is a directory.
    """
    abs_path = f"dbfs:{dbfs_path}"
    # Renamed from `json` to avoid shadowing the stdlib module name.
    status_json = dbfs_service.get_status(abs_path, headers=headers)
    file_info = FileInfo.from_json(status_json)
    if file_info.is_dir:
        error_and_quit('The dbfs file {} is a directory.'.format(
            repr(abs_path)))
    length = file_info.file_size
    offset = 0
    # Accumulate raw bytes and decode once at the end: decoding each chunk
    # separately would raise UnicodeDecodeError whenever a multi-byte UTF-8
    # character straddles a chunk boundary.
    output = io.BytesIO()
    while offset < length:
        response = dbfs_service.read(abs_path,
                                     offset,
                                     BUFFER_SIZE_BYTES,
                                     headers=headers)
        offset += response['bytes_read']
        output.write(b64decode(response['data']))
    return output.getvalue().decode("utf-8")
Code example #4
0
File: config.py  Project: mengxr/databricks-cli
def get_dbfs_client():
    """Build a DbfsService bound to the module's configured API client."""
    return DbfsService(_get_api_client())
Code example #5
0
File: api.py  Project: snosrap/databricks-cli
 def __init__(self, api_client):
     """Wrap *api_client* in a DbfsService used for all DBFS REST calls."""
     self.client = DbfsService(api_client)
Code example #6
0
File: api.py  Project: snosrap/databricks-cli
class DbfsApi(object):
    """High-level client for the DBFS REST API, wrapping DbfsService."""

    # Files of this size (2 GiB) or larger cannot use the single-request
    # multipart upload and fall back to the streaming upload path.
    MULTIPART_UPLOAD_LIMIT = 2147483648

    def __init__(self, api_client):
        self.client = DbfsService(api_client)

    def list_files(self, dbfs_path, headers=None):
        """Return the FileInfo entries under *dbfs_path* (empty list if none)."""
        list_response = self.client.list(dbfs_path.absolute_path, headers=headers)
        if 'files' in list_response:
            return [FileInfo.from_json(f) for f in list_response['files']]
        else:
            return []

    def file_exists(self, dbfs_path, headers=None):
        """Return True if *dbfs_path* exists; re-raise unexpected HTTP errors."""
        try:
            self.get_status(dbfs_path, headers=headers)
        except HTTPError as e:
            try:
                if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                    return False
            except ValueError:
                # The error body was not JSON; treat it as an unexpected error.
                pass

            raise e
        return True

    def get_status(self, dbfs_path, headers=None):
        """Return the FileInfo describing *dbfs_path* (HTTPError if missing)."""
        json = self.client.get_status(dbfs_path.absolute_path, headers=headers)
        return FileInfo.from_json(json)

    # Method makes multipart/form-data file upload for files <2GB.
    # Otherwise uses create, add-block, close methods for streaming upload.
    def put_file(self, src_path, dbfs_path, overwrite, headers=None):
        """Upload the local file *src_path* to *dbfs_path*."""
        # If file size is >2Gb use streaming upload.
        if os.path.getsize(src_path) < self.MULTIPART_UPLOAD_LIMIT:
            self.client.put(dbfs_path.absolute_path, src_path=src_path,
                            overwrite=overwrite, headers=headers)
        else:
            handle = self.client.create(dbfs_path.absolute_path, overwrite,
                                        headers=headers)['handle']
            with open(src_path, 'rb') as local_file:
                while True:
                    contents = local_file.read(BUFFER_SIZE_BYTES)
                    if len(contents) == 0:
                        break
                    # add_block should not take a bytes object.
                    self.client.add_block(handle, b64encode(contents).decode(), headers=headers)
                self.client.close(handle, headers=headers)

    def get_file(self, dbfs_path, dst_path, overwrite, headers=None):
        """Download *dbfs_path* to *dst_path*; refuse to clobber unless *overwrite*."""
        if os.path.exists(dst_path) and not overwrite:
            raise LocalFileExistsException('{} exists already.'.format(dst_path))
        file_info = self.get_status(dbfs_path, headers=headers)
        if file_info.is_dir:
            error_and_quit(('The dbfs file {} is a directory.').format(repr(dbfs_path)))
        length = file_info.file_size
        offset = 0
        with open(dst_path, 'wb') as local_file:
            while offset < length:
                response = self.client.read(dbfs_path.absolute_path, offset, BUFFER_SIZE_BYTES,
                                            headers=headers)
                bytes_read = response['bytes_read']
                data = response['data']
                offset += bytes_read
                local_file.write(b64decode(data))

    @staticmethod
    def get_num_files_deleted(partial_delete_error):
        """Parse the number of deleted files out of a partial-delete error.

        Raises:
            ParseException: when the error carries no parseable message.
        """
        try:
            message = partial_delete_error.response.json()['message']
        except (AttributeError, KeyError):
            raise ParseException("Unable to retrieve the number of deleted files.")
        m = re.compile(r".*operation has deleted (\d+) files.*").match(message)
        if not m:
            raise ParseException(
                "Unable to retrieve the number of deleted files from the error message: {}".format(
                    message))
        return int(m.group(1))

    def delete(self, dbfs_path, recursive, headers=None):
        """Delete *dbfs_path*, retrying while the server reports partial deletes."""
        num_files_deleted = 0
        while True:
            try:
                self.client.delete(dbfs_path.absolute_path, recursive=recursive, headers=headers)
            except HTTPError as e:
                if e.response.status_code == 503:
                    try:
                        error_code = e.response.json()['error_code']
                    except (AttributeError, KeyError):
                        error_code = None
                    # Handle partial delete exceptions: retry until all the files have been deleted
                    if error_code == DbfsErrorCodes.PARTIAL_DELETE:
                        try:
                            num_files_deleted += DbfsApi.get_num_files_deleted(e)
                            click.echo("\rDeleted {} files. Delete in progress...\033[K".format(
                                num_files_deleted), nl=False)
                        except ParseException:
                            click.echo("\rDelete in progress...\033[K", nl=False)
                        continue
                click.echo("\rDeleted at least {} files but interrupted by error.\033[K".format(
                    num_files_deleted))
                raise e
            break
        click.echo("\rDelete finished successfully.\033[K")

    def mkdirs(self, dbfs_path, headers=None):
        """Create *dbfs_path* and any missing parent directories."""
        self.client.mkdirs(dbfs_path.absolute_path, headers=headers)

    def move(self, dbfs_src, dbfs_dst, headers=None):
        """Move/rename *dbfs_src* to *dbfs_dst* within DBFS."""
        self.client.move(dbfs_src.absolute_path, dbfs_dst.absolute_path, headers=headers)

    def _copy_to_dbfs_non_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        # Munge dst path in case dbfs_path_dst is a dir
        try:
            if self.get_status(dbfs_path_dst, headers=headers).is_dir:
                dbfs_path_dst = dbfs_path_dst.join(os.path.basename(src))
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                pass
            else:
                raise e
        self.put_file(src, dbfs_path_dst, overwrite, headers=headers)

    def _copy_from_dbfs_non_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        # Munge dst path in case dst is a dir
        if os.path.isdir(dst):
            dst = os.path.join(dst, dbfs_path_src.basename)
        self.get_file(dbfs_path_src, dst, overwrite, headers=headers)

    def _copy_to_dbfs_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        try:
            self.mkdirs(dbfs_path_dst, headers=headers)
        except HTTPError as e:
            # NOTE(review): HTTP errors other than RESOURCE_ALREADY_EXISTS are
            # silently ignored here — confirm whether they should be re-raised.
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                click.echo(e.response.json())
                return
        for filename in os.listdir(src):
            cur_src = os.path.join(src, filename)
            cur_dbfs_dst = dbfs_path_dst.join(filename)
            if os.path.isdir(cur_src):
                self._copy_to_dbfs_recursive(cur_src, cur_dbfs_dst, overwrite, headers=headers)
            elif os.path.isfile(cur_src):
                try:
                    self.put_file(cur_src, cur_dbfs_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_src, cur_dbfs_dst))
                except HTTPError as e:
                    if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                        click.echo('{} already exists. Skip.'.format(cur_dbfs_dst))
                    else:
                        raise e

    def _copy_from_dbfs_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        if os.path.isfile(dst):
            click.echo(
                '{} exists as a file. Skipping this subtree {}'.format(dst, repr(dbfs_path_src)))
            return
        elif not os.path.isdir(dst):
            os.makedirs(dst)

        for dbfs_src_file_info in self.list_files(dbfs_path_src, headers=headers):
            cur_dbfs_src = dbfs_src_file_info.dbfs_path
            cur_dst = os.path.join(dst, cur_dbfs_src.basename)
            if dbfs_src_file_info.is_dir:
                self._copy_from_dbfs_recursive(cur_dbfs_src, cur_dst, overwrite, headers=headers)
            else:
                try:
                    self.get_file(cur_dbfs_src, cur_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_dbfs_src, cur_dst))
                except LocalFileExistsException:
                    click.echo(('{} already exists locally as {}. Skip. To overwrite, you ' +
                                'should provide the --overwrite flag.').format(cur_dbfs_src,
                                                                               cur_dst))

    def cp(self, recursive, overwrite, src, dst, headers=None):
        """Copy between the local filesystem and DBFS.

        Exactly one of *src*/*dst* may be local; a dbfs:/ → dbfs:/ copy is
        performed via a local temporary directory.
        """
        if not DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            if not os.path.exists(src):
                error_and_quit('The local file {} does not exist.'.format(src))
            if not recursive:
                if os.path.isdir(src):
                    error_and_quit(
                        ('The local file {} is a directory. You must provide --recursive')
                        .format(src))
                self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite, headers=headers)
            else:
                if not os.path.isdir(src):
                    self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite, headers=headers)
                    return
                self._copy_to_dbfs_recursive(src, DbfsPath(dst), overwrite, headers=headers)
        # Copy from DBFS in this case
        elif DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            if not recursive:
                self._copy_from_dbfs_non_recursive(DbfsPath(src), dst, overwrite, headers=headers)
            else:
                dbfs_path_src = DbfsPath(src)
                if not self.get_status(dbfs_path_src, headers=headers).is_dir:
                    self._copy_from_dbfs_non_recursive(dbfs_path_src, dst, overwrite,
                                                       headers=headers)
                    # Bug fix: without this return, a plain file would also fall
                    # through into the recursive copy below and be copied twice.
                    return
                self._copy_from_dbfs_recursive(dbfs_path_src, dst, overwrite, headers=headers)
        elif not DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            error_and_quit('Both paths provided are from your local filesystem. '
                           'To use this utility, one of the src or dst must be prefixed '
                           'with dbfs:/')
        elif DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            with TempDir() as temp_dir:
                # Always copy to <temp_dir>/temp since this will work no matter if it's a
                # recursive or a non-recursive copy.
                temp_path = temp_dir.path('temp')
                self.cp(recursive, True, src, temp_path)
                self.cp(recursive, overwrite, temp_path, dst)
        else:
            assert False, 'not reached'

    def cat(self, src):
        """Print the contents of the DBFS file *src* to stdout."""
        with TempDir() as temp_dir:
            temp_path = temp_dir.path('temp')
            self.cp(False, True, src, temp_path)
            with open(temp_path) as f:
                click.echo(f.read(), nl=False)
Code example #7
0
class DbfsApi(object):
    """High-level wrapper around the DbfsService REST client."""

    def __init__(self, api_client):
        self.client = DbfsService(api_client)

    def list_files(self, dbfs_path, headers=None):
        """Return the FileInfo entries under *dbfs_path* (empty list if none)."""
        list_response = self.client.list(dbfs_path.absolute_path, headers=headers)
        if 'files' in list_response:
            return [FileInfo.from_json(f) for f in list_response['files']]
        else:
            return []

    def file_exists(self, dbfs_path, headers=None):
        """Return True if *dbfs_path* exists; re-raise unexpected HTTP errors."""
        try:
            self.get_status(dbfs_path, headers=headers)
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                return False
            raise e
        return True

    def get_status(self, dbfs_path, headers=None):
        """Return the FileInfo describing *dbfs_path* (HTTPError if missing)."""
        json = self.client.get_status(dbfs_path.absolute_path, headers=headers)
        return FileInfo.from_json(json)

    def put_file(self, src_path, dbfs_path, overwrite, headers=None):
        """Stream the local file *src_path* up to *dbfs_path* in base64 blocks."""
        handle = self.client.create(dbfs_path.absolute_path, overwrite, headers=headers)['handle']
        with open(src_path, 'rb') as local_file:
            while True:
                contents = local_file.read(BUFFER_SIZE_BYTES)
                if len(contents) == 0:
                    break
                # add_block should not take a bytes object.
                self.client.add_block(handle, b64encode(contents).decode(), headers=headers)
            self.client.close(handle, headers=headers)

    def get_file(self, dbfs_path, dst_path, overwrite, headers=None):
        """Download *dbfs_path* to *dst_path*; refuse to clobber unless *overwrite*."""
        if os.path.exists(dst_path) and not overwrite:
            raise LocalFileExistsException('{} exists already.'.format(dst_path))
        file_info = self.get_status(dbfs_path, headers=headers)
        if file_info.is_dir:
            error_and_quit(('The dbfs file {} is a directory.').format(repr(dbfs_path)))
        length = file_info.file_size
        offset = 0
        with open(dst_path, 'wb') as local_file:
            while offset < length:
                response = self.client.read(dbfs_path.absolute_path, offset, BUFFER_SIZE_BYTES,
                                            headers=headers)
                bytes_read = response['bytes_read']
                data = response['data']
                offset += bytes_read
                local_file.write(b64decode(data))

    def delete(self, dbfs_path, recursive, headers=None):
        """Delete *dbfs_path*; *recursive* is required for non-empty directories."""
        self.client.delete(dbfs_path.absolute_path, recursive=recursive, headers=headers)

    def mkdirs(self, dbfs_path, headers=None):
        """Create *dbfs_path* and any missing parent directories."""
        self.client.mkdirs(dbfs_path.absolute_path, headers=headers)

    def move(self, dbfs_src, dbfs_dst, headers=None):
        """Move/rename *dbfs_src* to *dbfs_dst* within DBFS."""
        self.client.move(dbfs_src.absolute_path, dbfs_dst.absolute_path, headers=headers)

    def _copy_to_dbfs_non_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        # Munge dst path in case dbfs_path_dst is a dir
        try:
            if self.get_status(dbfs_path_dst, headers=headers).is_dir:
                dbfs_path_dst = dbfs_path_dst.join(os.path.basename(src))
        except HTTPError as e:
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                pass
            else:
                raise e
        self.put_file(src, dbfs_path_dst, overwrite, headers=headers)

    def _copy_from_dbfs_non_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        # Munge dst path in case dst is a dir
        if os.path.isdir(dst):
            dst = os.path.join(dst, dbfs_path_src.basename)
        self.get_file(dbfs_path_src, dst, overwrite, headers=headers)

    def _copy_to_dbfs_recursive(self, src, dbfs_path_dst, overwrite, headers=None):
        try:
            self.mkdirs(dbfs_path_dst, headers=headers)
        except HTTPError as e:
            # NOTE(review): HTTP errors other than RESOURCE_ALREADY_EXISTS are
            # silently ignored here — confirm whether they should be re-raised.
            if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                click.echo(e.response.json())
                return
        for filename in os.listdir(src):
            cur_src = os.path.join(src, filename)
            cur_dbfs_dst = dbfs_path_dst.join(filename)
            if os.path.isdir(cur_src):
                self._copy_to_dbfs_recursive(cur_src, cur_dbfs_dst, overwrite, headers=headers)
            elif os.path.isfile(cur_src):
                try:
                    self.put_file(cur_src, cur_dbfs_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_src, cur_dbfs_dst))
                except HTTPError as e:
                    if e.response.json()['error_code'] == DbfsErrorCodes.RESOURCE_ALREADY_EXISTS:
                        click.echo('{} already exists. Skip.'.format(cur_dbfs_dst))
                    else:
                        raise e

    def _copy_from_dbfs_recursive(self, dbfs_path_src, dst, overwrite, headers=None):
        if os.path.isfile(dst):
            click.echo(
                '{} exists as a file. Skipping this subtree {}'.format(dst, repr(dbfs_path_src)))
            return
        elif not os.path.isdir(dst):
            os.makedirs(dst)

        for dbfs_src_file_info in self.list_files(dbfs_path_src, headers=headers):
            cur_dbfs_src = dbfs_src_file_info.dbfs_path
            cur_dst = os.path.join(dst, cur_dbfs_src.basename)
            if dbfs_src_file_info.is_dir:
                self._copy_from_dbfs_recursive(cur_dbfs_src, cur_dst, overwrite, headers=headers)
            else:
                try:
                    self.get_file(cur_dbfs_src, cur_dst, overwrite, headers=headers)
                    click.echo('{} -> {}'.format(cur_dbfs_src, cur_dst))
                except LocalFileExistsException:
                    # Bug fix: the two concatenated fragments were missing the
                    # separating space and printed "youshould".
                    click.echo(('{} already exists locally as {}. Skip. To overwrite, you ' +
                                'should provide the --overwrite flag.').format(cur_dbfs_src,
                                                                               cur_dst))

    def cp(self, recursive, overwrite, src, dst, headers=None):
        """Copy between the local filesystem and DBFS.

        Exactly one of *src*/*dst* must be a dbfs:/ path; dbfs:/ → dbfs:/
        copies are rejected in this implementation.
        """
        if not DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            if not os.path.exists(src):
                error_and_quit('The local file {} does not exist.'.format(src))
            if not recursive:
                if os.path.isdir(src):
                    error_and_quit(
                        ('The local file {} is a directory. You must provide --recursive')
                        .format(src))
                self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite, headers=headers)
            else:
                if not os.path.isdir(src):
                    self._copy_to_dbfs_non_recursive(src, DbfsPath(dst), overwrite, headers=headers)
                    return
                self._copy_to_dbfs_recursive(src, DbfsPath(dst), overwrite, headers=headers)
        # Copy from DBFS in this case
        elif DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            if not recursive:
                self._copy_from_dbfs_non_recursive(DbfsPath(src), dst, overwrite, headers=headers)
            else:
                dbfs_path_src = DbfsPath(src)
                if not self.get_status(dbfs_path_src, headers=headers).is_dir:
                    self._copy_from_dbfs_non_recursive(dbfs_path_src, dst, overwrite,
                                                       headers=headers)
                    # Bug fix: without this return, a plain file would also fall
                    # through into the recursive copy below and be copied twice.
                    return
                self._copy_from_dbfs_recursive(dbfs_path_src, dst, overwrite, headers=headers)
        elif not DbfsPath.is_valid(src) and not DbfsPath.is_valid(dst):
            error_and_quit('Both paths provided are from your local filesystem. '
                           'To use this utility, one of the src or dst must be prefixed '
                           'with dbfs:/')
        elif DbfsPath.is_valid(src) and DbfsPath.is_valid(dst):
            error_and_quit('Both paths provided are from the DBFS filesystem. '
                           'To copy between the DBFS filesystem, you currently must copy the '
                           'file from DBFS to your local filesystem and then back.')
        else:
            assert False, 'not reached'
Code example #8
0
class DbfsApi(object):
    """Thin convenience layer over the raw DbfsService REST client."""

    def __init__(self, api_client):
        self.client = DbfsService(api_client)

    def list_files(self, dbfs_path):
        """List the entries under *dbfs_path* as FileInfo objects."""
        response = self.client.list(dbfs_path.absolute_path)
        return [FileInfo.from_json(entry) for entry in response.get('files', [])]

    def file_exists(self, dbfs_path):
        """Return True when *dbfs_path* exists on DBFS."""
        try:
            self.get_status(dbfs_path)
        except HTTPError as e:
            error_code = e.response.json()['error_code']
            if error_code == DbfsErrorCodes.RESOURCE_DOES_NOT_EXIST:
                return False
            raise e
        return True

    def get_status(self, dbfs_path):
        """Fetch the FileInfo describing *dbfs_path*."""
        status = self.client.get_status(dbfs_path.absolute_path)
        return FileInfo.from_json(status)

    def put_file(self, src_path, dbfs_path, overwrite):
        """Stream the local file at *src_path* up to *dbfs_path*."""
        handle = self.client.create(dbfs_path.absolute_path,
                                    overwrite)['handle']
        with open(src_path, 'rb') as local_file:
            # The API expects base64 text blocks, not raw bytes.
            for chunk in iter(lambda: local_file.read(BUFFER_SIZE_BYTES), b''):
                self.client.add_block(handle, b64encode(chunk).decode())
            self.client.close(handle)

    def get_file(self, dbfs_path, dst_path, overwrite):
        """Download *dbfs_path* into local *dst_path*."""
        if os.path.exists(dst_path) and not overwrite:
            raise LocalFileExistsException(
                '{} exists already.'.format(dst_path))
        file_info = self.get_status(dbfs_path)
        if file_info.is_dir:
            error_and_quit(
                ('The dbfs file {} is a directory.').format(repr(dbfs_path)))
        total_size = file_info.file_size
        position = 0
        with open(dst_path, 'wb') as local_file:
            while position < total_size:
                chunk = self.client.read(dbfs_path.absolute_path, position,
                                         BUFFER_SIZE_BYTES)
                position += chunk['bytes_read']
                local_file.write(b64decode(chunk['data']))

    def delete(self, dbfs_path, recursive):
        """Remove *dbfs_path*; *recursive* controls directory deletion."""
        self.client.delete(dbfs_path.absolute_path, recursive=recursive)

    def mkdirs(self, dbfs_path):
        """Create *dbfs_path* and any missing parents."""
        self.client.mkdirs(dbfs_path.absolute_path)

    def move(self, dbfs_src, dbfs_dst):
        """Rename/move *dbfs_src* to *dbfs_dst* within DBFS."""
        self.client.move(dbfs_src.absolute_path, dbfs_dst.absolute_path)
Code example #9
0
 def __init__(self,
              api_client,
              delete_retry_delay_millis=DELETE_503_RETRY_DELAY_MILLIS):
     """Create the DBFS API wrapper.

     Args:
         api_client: authenticated API client used for DBFS REST calls.
         delete_retry_delay_millis: delay between retries of a delete
             request; the 503 in the default's name suggests it applies
             when the server answers HTTP 503 — confirm against the
             delete implementation.
     """
     self.client = DbfsService(api_client)
     self.delete_retry_delay_millis = delete_retry_delay_millis