Code Example #1
def clean_source(cfg: Dict[str, str]) -> None:
    """
    The guts of the "clean-source" command, this function deletes the source
    notebooks for the current course from the remote Databricks instance.

    :param cfg: The config. COURSE_NAME, COURSE_REMOTE_SOURCE, and DB_PROFILE
                are assumed to be set.

    :return: Nothing
    """
    check_config(cfg)
    db_profile = cfg['DB_PROFILE']
    remote_source = cfg['COURSE_REMOTE_SOURCE']

    # As in clean(), below: ensure the path exists before deleting it,
    # because "rm" will die if called on a nonexistent remote path.
    w = databricks.Workspace(profile=db_profile)
    w.mkdirs(remote_source)
    w.rm(remote_source, recursive=True)
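A minimal usage sketch follows. The course name, remote path, and profile
below are invented placeholders. (Here and throughout these examples, names
such as databricks (the project's workspace wrapper), check_config, bdc,
info, warn, working_directory, and get_self_paced_courses are assumed to be
imported elsewhere in the excerpted module.)

# Hypothetical invocation; all values are placeholders. check_config() may
# require additional keys; this shows only the three the docstring names.
cfg = {
    'COURSE_NAME': 'Example-Course',
    'COURSE_REMOTE_SOURCE': '/Users/someone@example.com/_source/Example-Course',
    'DB_PROFILE': 'DEFAULT',
}
clean_source(cfg)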
Code Example #2
def update_config(cfg: Dict[str, str]) -> Dict[str, str]:
    """
    Update the configuration, setting values that depend on the course name,
    which is assumed to be set in the configuration.

    :param cfg: current configuration

    :return: possibly adjusted configuration
    :raises CourseError: Configuration error.
    """
    course = cfg.get('COURSE_NAME')
    if not course:
        return cfg

    from os.path import join, normpath

    adj = cfg.copy()
    repo = adj['COURSE_REPO']

    self_paced = list(get_self_paced_courses(cfg))
    prefix = 'Self-Paced' if course in self_paced else ''

    adj['PREFIX'] = prefix
    adj['COURSE_HOME'] = normpath(join(repo, 'courses', prefix, course))
    if not adj.get('COURSE_YAML'):
        adj['COURSE_YAML'] = join(adj['COURSE_HOME'], 'build.yaml')
    adj['COURSE_MODULES'] = join(repo, 'modules', prefix, course)

    db_shard_home = adj.get('DB_SHARD_HOME')
    if not db_shard_home:
        # Let the databricks Workspace layer figure out the appropriate value
        # for home.
        try:
            w = databricks.Workspace(adj['DB_PROFILE'])
            db_shard_home = w.home
        except databricks.DatabricksError as e:
            # Ignore config errors. ~/.databrickscfg might not be there.
            if e.code != databricks.StatusCode.CONFIG_ERROR:
                raise

    if db_shard_home:
        adj['COURSE_REMOTE_SOURCE'] = f'{db_shard_home}/{adj["SOURCE"]}/{course}'
        adj['COURSE_REMOTE_TARGET'] = f'{db_shard_home}/{adj["TARGET"]}/{course}'

    return adj
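To make the path derivation concrete, here is a hypothetical trace; every
value below is invented for illustration, and the real values come from the
config and the Databricks workspace:

# Hypothetical input (all values invented):
#   COURSE_NAME   = 'ETL-Part-1'   (assumed to be a self-paced course)
#   COURSE_REPO   = '/home/me/repo'
#   SOURCE        = '_source'
#   TARGET        = '_build'
#   DB_SHARD_HOME = '/Users/me@example.com'
#
# update_config() would then add:
#   PREFIX               = 'Self-Paced'
#   COURSE_HOME          = '/home/me/repo/courses/Self-Paced/ETL-Part-1'
#   COURSE_YAML          = '/home/me/repo/courses/Self-Paced/ETL-Part-1/build.yaml'
#   COURSE_MODULES       = '/home/me/repo/modules/Self-Paced/ETL-Part-1'
#   COURSE_REMOTE_SOURCE = '/Users/me@example.com/_source/ETL-Part-1'
#   COURSE_REMOTE_TARGET = '/Users/me@example.com/_build/ETL-Part-1'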
Code Example #3
def clean(cfg: Dict[str, str]) -> None:
    """
    The guts of the "clean" command, this function deletes the built (target)
    notebooks for the current course from the remote Databricks instance.

    :param cfg: The config. COURSE_NAME, COURSE_REMOTE_TARGET, and DB_PROFILE
                are assumed to be set.

    :return: Nothing
    """
    check_config(cfg)
    db_profile = cfg['DB_PROFILE']
    remote_target = cfg['COURSE_REMOTE_TARGET']

    # It's odd to ensure that the directory exists before removing it, but
    # that's easier (and really costs no more time) than issuing a REST call
    # to check whether it exists in the first place. And "rm" will die if
    # called on a nonexistent remote path.
    w = databricks.Workspace(profile=db_profile)
    w.mkdirs(remote_target)
    w.rm(remote_target, recursive=True)
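Since clean() and clean_source() share this ensure-then-delete idiom, it
could be factored into a small helper. A minimal sketch, assuming the same
Workspace wrapper API (the helper name is invented):

def _remove_remote_dir(db_profile: str, remote_path: str) -> None:
    """Hypothetical helper: idempotently delete a remote directory."""
    w = databricks.Workspace(profile=db_profile)
    w.mkdirs(remote_path)              # succeeds even if the path exists
    w.rm(remote_path, recursive=True)  # cannot fail on a missing path now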
Code Example #4
def import_dbcs(cfg: Dict[str, str], build_dir: str,
                build_file: str) -> None:
    """
    Find all DBC files under the build output directory for the current course,
    and upload them (import them) into the Databricks instance.

    :param cfg:        The config. COURSE_NAME, COURSE_REMOTE_TARGET, and
                       DB_PROFILE are assumed to be set.
    :param build_dir:  The path to the build directory.
    :param build_file: The path to the course's build file.

    :return: Nothing
    """
    check_config(cfg)
    remote_target = cfg['COURSE_REMOTE_TARGET']
    db_profile = cfg['DB_PROFILE']

    def import_dbc(dbc: str, build: bdc.BuildData) -> None:
        '''
        Import a single DBC.

        Assumes (a) the working directory is the build directory, and
        (b) that the remote target path has already been created.
        '''
        w = databricks.Workspace(profile=db_profile)
        if build.has_profiles:
            parent_subpath = os.path.dirname(dbc)
            dir_to_make = f'{remote_target}/{os.path.dirname(parent_subpath)}'
            w.mkdirs(dir_to_make)
            remote_path = f'{remote_target}/{parent_subpath}'
        else:
            remote_path = remote_target

        info(f'Importing "{dbc}" to "{remote_path}"...')
        w.import_dbc(dbc, remote_path)

    # Get the build information. We'll need it later.
    build = bdc.bdc_load_build(build_file)

    print(
        f'Importing all DBCs under "{build_dir}" to remote "{remote_target}"')
    dbcs = []
    with working_directory(build_dir) as pwd:
        for dirpath, _, filenames in os.walk('.'):
            for filename in filenames:
                _, ext = os.path.splitext(filename)
                if ext != '.dbc':
                    continue
                dbcs.append(os.path.normpath(os.path.join(dirpath, filename)))

        if not dbcs:
            warn('No DBCs found.')
        else:
            clean(cfg)
            w = databricks.Workspace(profile=db_profile)
            # If we're doing a profile-based build, create the remote target.
            # The import operations will implicitly create the remote
            # subfolders. However, if we're not doing profile-based builds,
            # then creating the remote target ahead of time will cause the
            # import to fail, so don't do that.
            if build.has_profiles:
                w.mkdirs(remote_target)

            for dbc in dbcs:
                info(f'\nIn "{pwd}":')
                import_dbc(dbc, build)
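The os.walk() loop that collects DBC paths could also be written with
pathlib; an equivalent sketch (the function name is invented, and it returns
the same build-dir-relative paths):

from pathlib import Path

def find_dbcs(build_dir: str) -> list:
    # Collect every *.dbc under build_dir, as paths relative to build_dir,
    # mirroring the os.walk() loop in import_dbcs().
    return [str(p.relative_to(build_dir))
            for p in Path(build_dir).rglob('*.dbc')]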