Example #1
0
def _import_dir_helper(source_path, target_path, overwrite):
    """
    Recursively import the local directory source_path into the workspace at target_path.

    The local directory is listed first, then target_path is created with mkdirs. Each
    sub-directory is imported recursively. Each regular file for which WorkspaceLanguage
    reports a non-empty extension is imported in SOURCE format under its name without that
    extension; other regular files are reported and skipped.

    :param source_path: local directory to import.
    :param target_path: workspace directory to import into.
    :param overwrite: passed through unchanged to import_workspace.
    """
    # Try doing the os.listdir before creating the dir in Databricks.
    filenames = os.listdir(source_path)
    try:
        mkdirs(target_path)
    except HTTPError as e:
        # Report the API error payload and give up on this directory only.
        click.echo(e.response.json())
        return
    for filename in filenames:
        cur_src = os.path.join(source_path, filename)
        # Workspace paths are '/'-separated; os.path.join would insert '\' on Windows.
        cur_dst = target_path.rstrip('/') + '/' + filename
        if os.path.isdir(cur_src):
            _import_dir_helper(cur_src, cur_dst, overwrite)
        elif os.path.isfile(cur_src):
            ext = WorkspaceLanguage.get_extension(cur_src)
            if ext != '':
                # Drop exactly the extension. str.rstrip(ext) would strip any trailing run of
                # the characters in ext (e.g. 'happy.py' -> 'ha') and corrupt the name.
                # Assumes ext is the literal suffix of the file name -- TODO confirm.
                cur_dst = cur_dst[:-len(ext)]
                language = WorkspaceLanguage.to_language(cur_src)
                import_workspace(cur_src, cur_dst, language,
                                 WorkspaceFormat.SOURCE, overwrite)
                click.echo('{} -> {}'.format(cur_src, cur_dst))
            else:
                extensions = ', '.join(WorkspaceLanguage.EXTENSIONS)
                click.echo((
                    '{} does not have a valid extension of {}. Skip this file and '
                    + 'continue.').format(cur_src, extensions))
Example #2
0
 def import_workspace_dir(self, source_path, target_path, overwrite, exclude_hidden_files,
                          headers=None):
     """
     Recursively import the local directory source_path into the workspace at target_path.

     :param source_path: local directory to import.
     :param target_path: workspace directory that is created and filled.
     :param overwrite: passed through unchanged to import_workspace.
     :param exclude_hidden_files: when truthy, entries whose name starts with '.' are ignored.
     :param headers: passed through to mkdirs, import_workspace and the recursive calls.
     """
     # pylint: disable=too-many-locals
     entries = os.listdir(source_path)
     if exclude_hidden_files:
         # "Hidden" currently just means the name starts with a dot.
         entries = [name for name in entries if not name.startswith('.')]
     try:
         self.mkdirs(target_path, headers=headers)
     except HTTPError as err:
         # Print the error payload and abandon this directory.
         click.echo(err.response.json())
         return
     for name in entries:
         local_item = os.path.join(source_path, name)
         # Build the workspace path by hand: os.path.join would use '\' on Windows.
         remote_item = target_path.rstrip('/') + '/' + name
         if os.path.isdir(local_item):
             self.import_workspace_dir(local_item, remote_item, overwrite,
                                       exclude_hidden_files, headers=headers)
             continue
         if not os.path.isfile(local_item):
             # Neither a directory nor a regular file: nothing to do.
             continue
         ext = WorkspaceLanguage.get_extension(local_item)
         if ext == '':
             extensions = ', '.join(WorkspaceLanguage.EXTENSIONS)
             click.echo(('{} does not have a valid extension of {}. Skip this file and ' +
                         'continue.').format(local_item, extensions))
             continue
         # The workspace object is named after the file without its extension.
         remote_item = remote_item[:-len(ext)]
         (language, file_format) = WorkspaceLanguage.to_language_and_format(local_item)
         self.import_workspace(local_item, remote_item, language, file_format, overwrite,
                               headers=headers)
         click.echo('{} -> {}'.format(local_item, remote_item))
Example #3
0
def _import_dir_helper(source_path, target_path, overwrite, exclude_hidden_files):
    """
    Recursively import the local directory source_path into the workspace at target_path.

    :param source_path: local directory to import.
    :param target_path: workspace directory that is created and filled.
    :param overwrite: passed through unchanged to import_workspace.
    :param exclude_hidden_files: when truthy, entries whose name starts with '.' are ignored.
    """
    # List the local directory first so a bad source_path fails before anything is created
    # in Databricks.
    entries = os.listdir(source_path)
    if exclude_hidden_files:
        # "Hidden" currently just means the name starts with a dot.
        entries = [name for name in entries if not name.startswith('.')]
    try:
        mkdirs(target_path)
    except HTTPError as err:
        # Print the error payload and abandon this directory.
        click.echo(err.response.json())
        return
    for name in entries:
        local_item = os.path.join(source_path, name)
        # Build the workspace path by hand: os.path.join would use '\' on Windows.
        remote_item = target_path.rstrip('/') + '/' + name
        if os.path.isdir(local_item):
            _import_dir_helper(local_item, remote_item, overwrite, exclude_hidden_files)
            continue
        if not os.path.isfile(local_item):
            # Neither a directory nor a regular file: nothing to do.
            continue
        ext = WorkspaceLanguage.get_extension(local_item)
        if ext == '':
            extensions = ', '.join(WorkspaceLanguage.EXTENSIONS)
            click.echo(('{} does not have a valid extension of {}. Skip this file and ' +
                        'continue.').format(local_item, extensions))
            continue
        # The workspace object is named after the file without its extension.
        remote_item = remote_item[:-len(ext)]
        (language, file_format) = WorkspaceLanguage.to_language_and_format(local_item)
        import_workspace(local_item, remote_item, language, file_format, overwrite)
        click.echo('{} -> {}'.format(local_item, remote_item))
Example #4
0
 def export_workspace_dir(self,
                          source_path,
                          target_path,
                          overwrite,
                          format=WorkspaceFormat.SOURCE):  # pylint: disable=redefined-builtin
     """
     Recursively export the workspace directory source_path to the local target_path.

     If target_path exists as a regular file the whole subtree is skipped. Otherwise the
     directory is created when missing, sub-directories are exported recursively and each
     notebook is written to a file named after it plus the extension for its language.
     Objects that are neither directories nor notebooks are reported and skipped.

     :param source_path: workspace directory to export.
     :param target_path: local directory to export into.
     :param overwrite: passed through unchanged to export_workspace.
     :param format: export format passed to export_workspace; defaults to
         WorkspaceFormat.SOURCE. It comes after overwrite because a parameter without a
         default cannot follow one with a default, and callers pass overwrite third.
     """
     if os.path.isfile(target_path):
         click.echo('{} exists as a file. Skipping this subtree {}'.format(
             target_path, source_path))
         return
     if not os.path.isdir(target_path):
         os.makedirs(target_path)
     for obj in self.list_objects(source_path):
         cur_src = obj.path
         cur_dst = os.path.join(target_path, obj.basename)
         if obj.is_dir:
             # Forward the format too so nested notebooks use the requested format.
             self.export_workspace_dir(cur_src, cur_dst, overwrite, format)
         elif obj.is_notebook:
             cur_dst = cur_dst + WorkspaceLanguage.to_extension(
                 obj.language)
             try:
                 self.export_workspace(cur_src, cur_dst, format, overwrite)
                 click.echo('{} -> {}'.format(cur_src, cur_dst))
             except LocalFileExistsException:
                 click.echo('{} already exists locally as {}. Skip.'.format(
                     cur_src, cur_dst))
         else:
             click.echo(
                 '{} is neither a dir or a notebook. Skip.'.format(cur_src))
Example #5
0
    def _download_workspace(self, resource_properties, overwrite):
        """
        Download workspace asset.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
        'source_path', 'path' and 'object_type' fields.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :raises StackError: if the notebook language/format cannot be inferred from the local
        path, or if 'object_type' is neither NOTEBOOK nor DIRECTORY.
        """
        local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
        workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
        object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)
        click.echo('Downloading {} from Databricks path {} to {}'.format(object_type,
                                                                         workspace_path,
                                                                         local_path))
        if object_type == NOTEBOOK:
            # Inference of notebook language and format. A tuple of (language, fmt) or Nonetype.
            language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
            if language_fmt is None:
                # Trailing space keeps the two adjacent literals from running together.
                raise StackError("Workspace Notebook language and format cannot be inferred. "
                                 "Please check file extension of notebook 'source_path'.")
            # Only the format is needed for an export; the language is discarded.
            (_, fmt) = language_fmt
            # Make sure the local parent directory exists before the notebook is written.
            local_dir = os.path.dirname(os.path.abspath(local_path))
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            self.workspace_client.export_workspace(workspace_path, local_path, fmt, overwrite)
        elif object_type == DIRECTORY:
            self.workspace_client.export_workspace_dir(workspace_path, local_path, overwrite)
        else:
            raise StackError("Invalid value for '{}' field: {}"
                             .format(WORKSPACE_RESOURCE_OBJECT_TYPE, object_type))
Example #6
0
    def _deploy_workspace(self, resource_properties, databricks_id, overwrite):
        """
        Upload a local notebook or directory to the Databricks workspace.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
        'source_path', 'path' and 'object_type' fields.
        :param databricks_id: dict containing the physical identifier of the workspace asset from
        a previous deployment (may be empty or None). When present it should contain the field
        'path'.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :return: (dict, dict) of (databricks_id, deploy_output). databricks_id holds the workspace
        path of the deployed notebook or directory. deploy_output is what get_status returns for
        that path right after the upload.
        :raises StackError: if 'object_type' disagrees with what is found at 'source_path', or if
        the notebook language and format cannot be inferred from the file name.
        """
        source = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
        workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
        declared_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)

        # Anything that is not a local directory is treated as a notebook.
        if os.path.isdir(source):
            detected_type = DIRECTORY
        else:
            detected_type = NOTEBOOK
        if declared_type != detected_type:
            raise StackError('Field "{}" ({}) not consistent '
                             'with actual object type ({})'.format(
                                 WORKSPACE_RESOURCE_OBJECT_TYPE, declared_type,
                                 detected_type))

        click.echo('Uploading {} from {} to Databricks workspace at {}'.format(
            declared_type, source, workspace_path))
        if declared_type == NOTEBOOK:
            # Language and format are derived from the local file name.
            inferred = WorkspaceLanguage.to_language_and_format(source)
            if inferred is None:
                raise StackError(
                    "Workspace notebook language and format cannot be inferred. "
                    "Please check file extension of notebook file.")
            language, fmt = inferred
            # The parent workspace directory has to exist before the import.
            parent_dir = os.path.dirname(workspace_path)
            self.workspace_client.mkdirs(parent_dir)
            self.workspace_client.import_workspace(source, workspace_path,
                                                   language, fmt, overwrite)
        elif declared_type == DIRECTORY:
            self.workspace_client.import_workspace_dir(
                source,
                workspace_path,
                overwrite,
                exclude_hidden_files=True)
        else:
            # Unreachable: declared_type was checked against detected_type above.
            assert False

        if databricks_id:
            # The stored path is the workspace path of the last deployment; tell the user when
            # it has moved.
            previous_path = databricks_id[WORKSPACE_RESOURCE_PATH]
            if previous_path != workspace_path:
                click.echo("Workspace asset had path changed from {} to {}".format(
                    previous_path, workspace_path))

        current_id = {WORKSPACE_RESOURCE_PATH: workspace_path}
        status = self.workspace_client.client.get_status(workspace_path)
        return current_id, status
Example #7
0
    def _deploy_workspace(self, resource_properties, physical_id, overwrite):
        """
        Deploy workspace asset.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
        'source_path', 'path' and 'object_type' fields; 'object_type' has to agree with what is
        found at 'source_path' ('DIRECTORY' for a local directory, 'NOTEBOOK' otherwise).
        :param physical_id: dict containing physical identifier of workspace asset on databricks.
        Should contain the field 'path'.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :return: (dict, dict) of (physical_id, deploy_output). physical_id is the physical ID for
        the stack status that contains the workspace path of the notebook or directory on
        databricks. deploy_output is the initial information about the asset on databricks at
        deploy time returned by the REST API.
        :raises StackError: if 'object_type' is inconsistent with the local path, or if the
        notebook language and format cannot be inferred from the file name.
        """
        # Required fields. TODO(alinxie) put in _validate_config
        local_path = resource_properties.get('source_path')
        workspace_path = resource_properties.get('path')
        object_type = resource_properties.get('object_type')

        actual_object_type = 'DIRECTORY' if os.path.isdir(
            local_path) else 'NOTEBOOK'
        if object_type != actual_object_type:
            # Trailing space keeps the two adjacent literals from running together.
            raise StackError("Field 'object_type' ({}) not consistent "
                             "with actual object type ({})".format(
                                 object_type, actual_object_type))

        click.echo('Uploading {} from {} to Databricks workspace at {}'.format(
            object_type, local_path, workspace_path))
        if object_type == 'NOTEBOOK':
            # Inference of notebook language and format
            language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
            if language_fmt is None:
                raise StackError(
                    "Workspace notebook language and format cannot be inferred. "
                    "Please check file extension of notebook file.")
            language, fmt = language_fmt
            # Create needed directories in workspace.
            self.workspace_client.mkdirs(os.path.dirname(workspace_path))
            self.workspace_client.import_workspace(local_path, workspace_path,
                                                   language, fmt, overwrite)
        elif object_type == 'DIRECTORY':
            self.workspace_client.import_workspace_dir(
                local_path,
                workspace_path,
                overwrite,
                exclude_hidden_files=True)
        else:
            # Shouldn't reach here because of verification of object_type above.
            assert False

        if physical_id and physical_id['path'] != workspace_path:
            # physical_id['path'] is the workspace path from the last deployment. Alert when changed
            click.echo("Workspace asset had path changed from {} to {}".format(
                physical_id['path'], workspace_path))
        new_physical_id = {'path': workspace_path}
        deploy_output = self.workspace_client.client.get_status(workspace_path)

        return new_physical_id, deploy_output
Example #8
0
def export_workspace_cli(source_path, target_path, format, overwrite): # NOQA
    """
    Exports a notebook from the Databricks workspace.

    The format is by default SOURCE. Possible formats are SOURCE, HTML, JUPYTER, and DBC. Each
    format is documented at
    https://docs.databricks.com/api/latest/workspace.html#notebookexportformat.

    If the target path is an existing local directory, the notebook is written inside it under
    its workspace name plus the extension for its language.
    """
    destination = target_path
    if os.path.isdir(destination):
        # A directory was given: derive the file name from the workspace object itself.
        status = get_status(source_path)
        if not status.is_notebook:
            raise RuntimeError('Export can only be called on a notebook.')
        suffix = WorkspaceLanguage.to_extension(status.language)
        destination = os.path.join(destination, status.basename + suffix)
    export_workspace(source_path, destination, format, overwrite) # NOQA
Example #9
0
 def export_workspace_dir(self,
                          source_path,
                          target_path,
                          overwrite,
                          headers=None):
     """
     Recursively export the workspace directory source_path to a local directory.

     The local directory name is target_path with every match of
     LOCAL_OS_COMPATIBLE_PATH_REGEX replaced by '_'. If that path exists as a regular file the
     whole subtree is skipped. Notebooks are exported in SOURCE format to a file named after
     the notebook plus the extension for its language; objects that are neither directories
     nor notebooks are reported and skipped.

     :param source_path: workspace directory to export.
     :param target_path: local directory to export into (sanitised as described above).
     :param overwrite: passed through unchanged to export_workspace.
     :param headers: passed through to list_objects, export_workspace and recursive calls.
     """
     local_root = re.sub(LOCAL_OS_COMPATIBLE_PATH_REGEX, '_', target_path)
     if os.path.isfile(local_root):
         click.echo('{} exists as a file. Skipping this subtree {}'.format(
             local_root, source_path))
         return
     if not os.path.isdir(local_root):
         os.makedirs(local_root)
     for item in self.list_objects(source_path, headers=headers):
         remote_path = item.path
         local_path = os.path.join(local_root, item.basename)
         if item.is_dir:
             self.export_workspace_dir(remote_path, local_path, overwrite, headers=headers)
             continue
         if not item.is_notebook:
             click.echo(
                 '{} is neither a dir or a notebook. Skip.'.format(remote_path))
             continue
         # Notebooks get a file extension matching their language.
         local_path = local_path + WorkspaceLanguage.to_extension(item.language)
         try:
             self.export_workspace(remote_path, local_path, WorkspaceFormat.SOURCE,
                                   overwrite, headers=headers)
             click.echo('{} -> {}'.format(remote_path, local_path))
         except LocalFileExistsException:
             click.echo('{} already exists locally as {}. Skip.'.format(
                 remote_path, local_path))