def import_current_workspace_items(self, artifact_dir='artifacts/'):
     src_dir = self.get_export_dir() + artifact_dir
     error_logger = logging_utils.get_error_logger(
         wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT,
         self.get_export_dir())
     for root, subdirs, files in self.walk(src_dir):
         # replace the local directory with empty string to get the notebook workspace directory
         nb_dir = '/' + root.replace(src_dir, '')
         upload_dir = nb_dir
         if not nb_dir == '/':
             upload_dir = nb_dir + '/'
         if not self.does_path_exist(upload_dir):
             resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
             if 'error_code' in resp_mkdirs:
                 logging_utils.log_reponse_error(error_logger, resp_mkdirs)
         for f in files:
             logging.info("Uploading: {0}".format(f))
             # create the local file path to load the DBC file
             local_file_path = os.path.join(root, f)
             # create the ws full file path including filename
             ws_file_path = upload_dir + f
             # generate json args with binary data for notebook to upload to the workspace path
             nb_input_args = self.get_user_import_args(
                 local_file_path, ws_file_path)
             # call import to the workspace
             if self.is_verbose():
                 logging.info("Path: {0}".format(nb_input_args['path']))
             resp_upload = self.post(WS_IMPORT, nb_input_args)
             if 'error_code' in resp_upload:
                 resp_upload['path'] = nb_input_args['path']
                 logging_utils.log_reponse_error(error_logger, resp_upload)
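
A note on the import payload: get_user_import_args is not shown in this listing, but a minimal sketch of the JSON it plausibly builds for WS_IMPORT looks like the following. Field names follow the public Workspace API; the helper name and the 'DBC' format value are assumptions, not the tool's actual implementation.

import base64

def build_import_args_sketch(local_file_path, ws_file_path):
    # read the exported file and base64-encode it for the workspace import endpoint
    with open(local_file_path, 'rb') as f:
        content = base64.b64encode(f.read()).decode('ascii')
    # 'DBC' is assumed here; the real helper may pick SOURCE/JUPYTER/etc. per file type
    return {'path': ws_file_path, 'format': 'DBC', 'content': content, 'overwrite': True}
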
 def import_instance_profiles(self, log_file='instance_profiles.log'):
     # currently an AWS only operation
     error_logger = logging_utils.get_error_logger(
         wmconstants.WM_IMPORT, wmconstants.INSTANCE_PROFILE_OBJECT,
         self.get_export_dir())
     ip_log = self.get_export_dir() + log_file
     if not os.path.exists(ip_log):
         logging.info("No instance profiles to import.")
         return
     # check current profiles and skip if the profile already exists
     ip_list = self.get('/instance-profiles/list').get(
         'instance_profiles', None)
     if ip_list:
         list_of_profiles = [x['instance_profile_arn'] for x in ip_list]
     else:
         list_of_profiles = []
     import_profiles_count = 0
     with open(ip_log, "r") as fp:
         for line in fp:
             ip_arn = json.loads(line).get('instance_profile_arn', None)
             if ip_arn not in list_of_profiles:
                 print("Importing arn: {0}".format(ip_arn))
                 resp = self.post('/instance-profiles/add',
                                  {'instance_profile_arn': ip_arn})
                 if not logging_utils.log_reponse_error(error_logger, resp):
                     import_profiles_count += 1
             else:
                 logging.info(
                     "Skipping since profile already exists: {0}".format(
                         ip_arn))
     return import_profiles_count
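
For reference, each line of instance_profiles.log is expected to be a standalone JSON object carrying at least instance_profile_arn, which is why the loop above can json.loads the file line by line. A minimal sketch with a made-up ARN:

import json

sample_line = json.dumps(
    {'instance_profile_arn': 'arn:aws:iam::123456789012:instance-profile/example-profile'})
# this mirrors how import_instance_profiles reads the log back
assert json.loads(sample_line).get('instance_profile_arn', None) is not None
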
 def log_all_secrets(self, cluster_name=None, log_dir='secret_scopes/'):
     scopes_dir = self.get_export_dir() + log_dir
     scopes_list = self.get_secret_scopes_list()
     error_logger = logging_utils.get_error_logger(
         wmconstants.WM_EXPORT, wmconstants.SECRET_OBJECT,
         self.get_export_dir())
     os.makedirs(scopes_dir, exist_ok=True)
     start = timer()
     cid = self.start_cluster_by_name(
         cluster_name) if cluster_name else self.launch_cluster()
     time.sleep(5)
     ec_id = self.get_execution_context(cid)
     for scope_json in scopes_list:
         scope_name = scope_json.get('name')
         secrets_list = self.get_secrets(scope_name)
         if logging_utils.log_reponse_error(error_logger, secrets_list):
             continue
         scopes_logfile = scopes_dir + scope_name
         try:
             with open(scopes_logfile, 'w') as fp:
                 for secret_json in secrets_list:
                     secret_name = secret_json.get('key')
                     b64_value = self.get_secret_value(
                         scope_name, secret_name, cid, ec_id, error_logger)
                     s_json = {'name': secret_name, 'value': b64_value}
                     fp.write(json.dumps(s_json) + '\n')
         except ValueError as error:
             if "embedded null byte" in str(error):
                 error_msg = f"{scopes_logfile} has bad name and hence cannot open: {str(error)} Skipping.."
                 logging.error(error_msg)
                 error_logger.error(error_msg)
             else:
                 raise error
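
The secret values written above are base64-encoded strings (the value field returned by get_secret_value), and the import path decodes them back to plain text. A minimal round-trip sketch, with a hypothetical encode helper:

import base64
import json

def encode_secret_value(plain_text):
    # mirrors the encoding assumed for the 'value' field in the scope logfile
    return base64.b64encode(plain_text.encode('ascii')).decode('ascii')

line = json.dumps({'name': 'db_password', 'value': encode_secret_value('s3cr3t')})
decoded = base64.b64decode(json.loads(line)['value'].encode('ascii')).decode('ascii')
assert decoded == 's3cr3t'
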
Example #4
    def import_mlflow_experiments_acls(
            self,
            acl_log='mlflow_experiments_acls.log',
            experiment_id_map_log='mlflow_experiments_id_map.log',
            num_parallel=4):
        """
        Import all experiments' permissions that were previously exported to the acl_log file. The new
        experiment id for each entry is looked up in the experiment_id_map_log file.

        While the permissions are persisted, the original creator (the "Created By" label) is not;
        the creator is always set to the caller of this script.
        """
        experiment_id_map = self._load_experiment_id_map(self.export_dir +
                                                         experiment_id_map_log)
        acl_log_file = self.get_export_dir() + acl_log
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT,
            self.get_export_dir())
        checkpoint_key_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT)
        start = timer()
        with open(acl_log_file, 'r') as fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self._put_mlflow_experiment_acl, acl_str,
                                    experiment_id_map, checkpoint_key_set,
                                    error_logger) for acl_str in fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
        end = timer()
        logging.info("Complete MLflow Experiments Permissions Import Time: " +
                     str(timedelta(seconds=end - start)))
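
The wait(..., return_when="FIRST_EXCEPTION") / propagate_exceptions pairing used here (and in the other parallel importers below) stops waiting as soon as any worker raises, then re-raises that exception on the main thread. propagate_exceptions is not part of this listing; a minimal sketch of what it presumably does:

def propagate_exceptions_sketch(futures):
    # re-raise the first exception captured by any completed future
    for future in futures:
        if future.done() and future.exception() is not None:
            raise future.exception()
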
    def import_workspace_acls(self,
                              workspace_log_file='acl_notebooks.log',
                              dir_log_file='acl_directories.log',
                              num_parallel=1):
        """
        Import the notebook and directory ACLs by looping over the notebook and directory logfiles.
        """
        dir_acl_logs = self.get_export_dir() + dir_log_file
        notebook_acl_logs = self.get_export_dir() + workspace_log_file
        acl_notebooks_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT,
            self.get_export_dir())

        checkpoint_notebook_acl_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT)
        with open(notebook_acl_logs) as nb_acls_fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self.apply_acl_on_object, nb_acl_str,
                                    acl_notebooks_error_logger,
                                    checkpoint_notebook_acl_set)
                    for nb_acl_str in nb_acls_fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

        acl_dir_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT,
            self.get_export_dir())
        checkpoint_dir_acl_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT)

        with open(dir_acl_logs) as dir_acls_fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self.apply_acl_on_object, dir_acl_str,
                                    acl_dir_error_logger,
                                    checkpoint_dir_acl_set)
                    for dir_acl_str in dir_acls_fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
        print("Completed import ACLs of Notebooks and Directories")
    def log_all_workspace_acls(self,
                               workspace_log_file='user_workspace.log',
                               dir_log_file='user_dirs.log',
                               num_parallel=4):
        """
        Loop through all notebooks and directories and store their associated ACLs.
        :param workspace_log_file: input file for user notebook listing
        :param dir_log_file: input file for user directory listing
        """
        # define log file names for notebooks, folders, and libraries
        logging.info("Exporting the notebook permissions")
        start = timer()
        acl_notebooks_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT,
            self.get_export_dir())
        acl_notebooks_writer = ThreadSafeWriter(
            self.get_export_dir() + "acl_notebooks.log", "w")
        try:
            self.log_acl_to_file('notebooks', workspace_log_file,
                                 acl_notebooks_writer,
                                 acl_notebooks_error_logger, num_parallel)
        finally:
            acl_notebooks_writer.close()
        end = timer()
        logging.info("Complete Notebook ACLs Export Time: " +
                     str(timedelta(seconds=end - start)))

        logging.info("Exporting the directories permissions")
        start = timer()
        acl_directory_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT,
            self.get_export_dir())
        acl_directory_writer = ThreadSafeWriter(
            self.get_export_dir() + "acl_directories.log", "w")
        try:
            self.log_acl_to_file('directories', dir_log_file,
                                 acl_directory_writer,
                                 acl_directory_error_logger, num_parallel)
        finally:
            acl_directory_writer.close()
        end = timer()
        logging.info("Complete Directories ACLs Export Time: " +
                     str(timedelta(seconds=end - start)))
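
ThreadSafeWriter is used wherever several worker threads append to the same logfile. Its definition is not part of this listing; a minimal equivalent simply guards a shared file handle with a lock (the real class may buffer or batch writes differently):

import threading

class ThreadSafeWriterSketch:
    def __init__(self, path, mode):
        self._fp = open(path, mode)
        self._lock = threading.Lock()

    def write(self, text):
        # serialize writes from concurrent workers
        with self._lock:
            self._fp.write(text)

    def close(self):
        self._fp.close()
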
    def import_cluster_policies(self,
                                log_file='cluster_policies.log',
                                acl_log_file='acl_cluster_policies.log'):
        policies_log = self.get_export_dir() + log_file
        acl_policies_log = self.get_export_dir() + acl_log_file
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.CLUSTER_OBJECT,
            self.get_export_dir())
        checkpoint_cluster_policies_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.CLUSTER_OBJECT)
        # create the policies
        if os.path.exists(policies_log):
            with open(policies_log, 'r') as policy_fp:
                for p in policy_fp:
                    policy_conf = json.loads(p)
                    if 'policy_id' in policy_conf and checkpoint_cluster_policies_set.contains(
                            policy_conf['policy_id']):
                        continue
                    # when creating the policy, we only need `name` and `definition` fields
                    create_args = {
                        'name': policy_conf['name'],
                        'definition': policy_conf['definition']
                    }
                    resp = self.post('/policies/clusters/create', create_args)
                    ignore_error_list = ['INVALID_PARAMETER_VALUE']
                    if not logging_utils.log_reponse_error(
                            error_logger,
                            resp,
                            ignore_error_list=ignore_error_list):
                        if 'policy_id' in policy_conf:
                            checkpoint_cluster_policies_set.write(
                                policy_conf['policy_id'])

            # ACLs are created by using the `access_control_list` key
            with open(acl_policies_log, 'r') as acl_fp:
                id_map = self.get_policy_id_by_name_dict()
                for x in acl_fp:
                    p_acl = json.loads(x)
                    if 'object_id' in p_acl and checkpoint_cluster_policies_set.contains(
                            p_acl['object_id']):
                        continue
                    acl_create_args = {
                        'access_control_list':
                        self.build_acl_args(p_acl['access_control_list'])
                    }
                    policy_id = id_map[p_acl['name']]
                    api = f'/permissions/cluster-policies/{policy_id}'
                    resp = self.put(api, acl_create_args)
                    if not logging_utils.log_reponse_error(error_logger, resp):
                        if 'object_id' in p_acl:
                            checkpoint_cluster_policies_set.write(
                                p_acl['object_id'])
        else:
            logging.info('Skipping cluster policies as no log file exists')
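
For context, the body PUT to /permissions/cluster-policies/{policy_id} above is a list of access-control entries; build_acl_args presumably produces entries of roughly this shape (the principal and permission level below are illustrative placeholders):

example_acl_create_args = {
    'access_control_list': [
        {'group_name': 'data-engineers', 'permission_level': 'CAN_USE'},
    ]
}
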
Example #8
 def export_database(self,
                     db_name,
                     cluster_name=None,
                     iam_role=None,
                     metastore_dir='metastore/',
                     success_log='success_metastore.log',
                     has_unicode=False,
                     db_log='database_details.log'):
     """
     :param db_name: database name
     :param cluster_name: cluster to run against, if provided
     :param iam_role: IAM role to launch the cluster with
     :param metastore_dir: directory to store all the metadata
     :param success_log: logfile tracking tables that exported successfully
     :param has_unicode: whether the metadata contains unicode characters that need special handling
     :param db_log: specific database properties logfile
     :return:
     """
     # check if instance profile exists, ask users to use --users first or enter yes to proceed.
     start = timer()
     if cluster_name:
         cid = self.start_cluster_by_name(cluster_name)
         current_iam = self.get_iam_role_by_cid(cid)
     else:
         current_iam = iam_role
         cid = self.launch_cluster(current_iam)
     end = timer()
     logging.info("Cluster creation time: " +
                  str(timedelta(seconds=end - start)))
     time.sleep(5)
     ec_id = self.get_execution_context(cid)
     checkpoint_metastore_set = self._checkpoint_service.get_checkpoint_key_set(
         wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES)
     # if metastore failed log path exists, cleanup before re-running
     error_logger = logging_utils.get_error_logger(
         wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES,
         self.get_export_dir())
     success_metastore_log_path = self.get_export_dir() + success_log
     if os.path.exists(success_metastore_log_path):
         os.remove(success_metastore_log_path)
     database_logfile = self.get_export_dir() + db_log
     resp = self.set_desc_database_helper(cid, ec_id)
     if self.is_verbose():
         logging.info(resp)
     with open(database_logfile, 'w') as fp:
         db_json = self.get_desc_database_details(db_name, cid, ec_id)
         fp.write(json.dumps(db_json) + '\n')
     os.makedirs(self.get_export_dir() + metastore_dir + db_name,
                 exist_ok=True)
     self.log_all_tables(db_name, cid, ec_id, metastore_dir, error_logger,
                         success_metastore_log_path, current_iam,
                         checkpoint_metastore_set, has_unicode)
Example #9
    def import_all_groups(self, group_log_dir='groups/'):
        group_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.GROUP_OBJECT, self.get_export_dir())
        group_dir = self.get_export_dir() + group_log_dir
        current_user_ids = self.get_user_id_mapping()
        self.import_groups(group_dir, current_user_ids, group_error_logger)
        # assign the users to IAM roles if on AWS
        if self.is_aws():
            logging.info("Update group role assignments")
            self.assign_group_roles(group_dir, group_error_logger)

        # need to separate role assignment and entitlements to support Azure
        logging.info("Updating groups entitlements")
        self.assign_group_entitlements(group_dir, group_error_logger)
 def log_all_secrets_acls(self, log_name='secret_scopes_acls.log'):
     acls_file = self.get_export_dir() + log_name
     error_logger = logging_utils.get_error_logger(
         wmconstants.WM_EXPORT, wmconstants.SECRET_OBJECT,
         self.get_export_dir())
     scopes_list = self.get_secret_scopes_list()
     with open(acls_file, 'w') as fp:
         for scope_json in scopes_list:
             scope_name = scope_json.get('name', None)
             resp = self.get('/secrets/acls/list', {'scope': scope_name})
             if logging_utils.log_reponse_error(error_logger, resp):
                 return
             else:
                 resp['scope_name'] = scope_name
                 fp.write(json.dumps(resp) + '\n')
 def import_instance_pools(self, log_file='instance_pools.log'):
     pool_log = self.get_export_dir() + log_file
     error_logger = logging_utils.get_error_logger(
         wmconstants.WM_IMPORT, wmconstants.INSTANCE_POOL_OBJECT,
         self.get_export_dir())
     if not os.path.exists(pool_log):
         logging.info("No instance pools to import.")
         return
     with open(pool_log, 'r') as fp:
         for line in fp:
             pool_conf = json.loads(line)
             pool_resp = self.post('/instance-pools/create', pool_conf)
             ignore_error_list = ['INVALID_PARAMETER_VALUE']
             logging_utils.log_reponse_error(
                 error_logger,
                 pool_resp,
                 ignore_error_list=ignore_error_list)
Example #12
    def import_all_users(self, user_log_file='users.log', num_parallel=4):
        checkpoint_users_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.USER_OBJECT)
        user_log = self.get_export_dir() + user_log_file
        user_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.USER_OBJECT, self.get_export_dir())

        self.import_users(user_log, user_error_logger, checkpoint_users_set, num_parallel)
        current_user_ids = self.get_user_id_mapping()
        self.log_failed_users(current_user_ids, user_log, user_error_logger)
        # assign the users to IAM roles if on AWS
        if self.is_aws():
            logging.info("Update user role assignments")
            self.assign_user_roles(current_user_ids, user_error_logger, user_log_file)

        # need to separate role assignment and entitlements to support Azure
        logging.info("Updating users entitlements")
        self.assign_user_entitlements(current_user_ids, user_error_logger, user_log_file)
Example #13
    def export_mlflow_runs(self,
                           start_date,
                           log_sql_file='mlflow_runs.db',
                           experiment_log='mlflow_experiments.log',
                           num_parallel=4):
        """
        Export the MLflow run objects. This can be run only after export_mlflow_experiments is complete.
        Unlike other objects, the runs are saved to sqlite tables, given the potentially large number of run objects.
        """
        experiments_logfile = self.export_dir + experiment_log
        mlflow_runs_checkpointer = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_EXPORT, wmconstants.MLFLOW_RUNS)
        start = timer()
        con = sqlite3.connect(self.export_dir + log_sql_file)
        with con:
            con.execute('''
              CREATE TABLE IF NOT EXISTS runs (id TEXT UNIQUE, start_time INT, run_obj TEXT)
            ''')
        con.close()

        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.MLFLOW_RUN_OBJECT,
            self.export_dir)
        start_date = start_date if start_date else datetime.now() - timedelta(
            days=30)
        start_time_epoch_ms = start_date.timestamp() * 1000
        with open(experiments_logfile, 'r') as fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self._export_runs_in_an_experiment,
                                    start_time_epoch_ms, log_sql_file,
                                    experiment_str, mlflow_runs_checkpointer,
                                    error_logger) for experiment_str in fp
                ]
                results = concurrent.futures.wait(
                    futures, return_when="FIRST_EXCEPTION")
                for result in results.done:
                    if result.exception() is not None:
                        raise result.exception()

        end = timer()
        logging.info("Complete MLflow Runs Export Time: " +
                     str(timedelta(seconds=end - start)))
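
Each run exported by _export_runs_in_an_experiment (not shown here) presumably ends up as one row in the runs table created above; the UNIQUE constraint on id keeps re-runs idempotent. A minimal sketch of such an insert, with placeholder values:

import json
import sqlite3

con = sqlite3.connect('mlflow_runs.db')
with con:
    con.execute('CREATE TABLE IF NOT EXISTS runs (id TEXT UNIQUE, start_time INT, run_obj TEXT)')
    con.execute('INSERT OR REPLACE INTO runs VALUES (?, ?, ?)',
                ('run-1234', 1700000000000, json.dumps({'info': {}, 'data': {}})))
con.close()
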
Example #14
 def log_job_configs(self,
                     users_list=[],
                     log_file='jobs.log',
                     acl_file='acl_jobs.log'):
     """
     log all job configs and the ACLs for each job
     :param users_list: a list of users / emails to filter the results by (optional for group exports)
     :param log_file: log file to store job configs as json entries per line
     :param acl_file: log file to store job ACLs
     :return:
     """
     jobs_log = self.get_export_dir() + log_file
     acl_jobs_log = self.get_export_dir() + acl_file
     error_logger = logging_utils.get_error_logger(wmconstants.WM_EXPORT,
                                                   wmconstants.JOB_OBJECT,
                                                   self.get_export_dir())
     # pinned by cluster_user is a flag per cluster
     jl_full = self.get_jobs_list(False)
     if users_list:
         # filter the jobs list to only contain users that exist within this list
         jl = list(
             filter(lambda x: x.get('creator_user_name', '') in users_list,
                    jl_full))
     else:
         jl = jl_full
     with open(jobs_log, "w") as log_fp, open(acl_jobs_log, 'w') as acl_fp:
         for x in jl:
             job_id = x['job_id']
             new_job_name = x['settings']['name'] + ':::' + str(job_id)
             # grab the settings obj
             job_settings = x['settings']
             # update the job name
             job_settings['name'] = new_job_name
             # reset the original struct with the new settings
             x['settings'] = job_settings
             log_fp.write(json.dumps(x) + '\n')
             job_perms = self.get(f'/preview/permissions/jobs/{job_id}')
             if not logging_utils.log_reponse_error(error_logger,
                                                    job_perms):
                 job_perms['job_name'] = new_job_name
                 acl_fp.write(json.dumps(job_perms) + '\n')
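
Because log_job_configs suffixes each job name with ':::<old_job_id>', the import side can recover the original id later by splitting on the last ':::'. A quick sketch:

old_name, old_job_id = 'nightly-etl:::12345'.rsplit(':::', 1)
assert old_name == 'nightly-etl' and old_job_id == '12345'
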
Example #15
    def export_mlflow_experiments_acls(
            self,
            experiment_log='mlflow_experiments.log',
            acl_log_file='mlflow_experiments_acls.log',
            num_parallel=4):
        """
        Export the permissions of all experiment objects that were already exported and logged in the experiment_log file.
        :return: writes the result to acl_log_file
        """
        experiments_logfile = self.export_dir + experiment_log
        acl_log_file_writer = ThreadSafeWriter(self.export_dir + acl_log_file,
                                               'a')
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT,
            self.get_export_dir())
        checkpoint_key_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_EXPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT)

        start = timer()
        try:
            with open(experiments_logfile, 'r') as fp:
                with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                    futures = [
                        executor.submit(self._get_mlflow_experiment_acls,
                                        acl_log_file_writer, experiment_str,
                                        checkpoint_key_set, error_logger)
                        for experiment_str in fp
                    ]
                    concurrent.futures.wait(futures,
                                            return_when="FIRST_EXCEPTION")
                    propagate_exceptions(futures)
        finally:
            acl_log_file_writer.close()
        end = timer()
        logging.info("Complete MLflow Experiments Permissions Export Time: " +
                     str(timedelta(seconds=end - start)))
Example #16
    def import_mlflow_experiments(self,
                                  log_file='mlflow_experiments.log',
                                  id_map_file='mlflow_experiments_id_map.log',
                                  log_dir=None,
                                  num_parallel=4):
        mlflow_experiments_dir = log_dir if log_dir else self.export_dir
        experiments_logfile = mlflow_experiments_dir + log_file
        experiments_id_map_file = mlflow_experiments_dir + id_map_file

        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_OBJECT,
            self.export_dir)
        mlflow_experiments_checkpointer = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_OBJECT)
        start = timer()

        id_map_thread_safe_writer = ThreadSafeWriter(experiments_id_map_file,
                                                     'a')

        try:
            with open(experiments_logfile, 'r') as fp:
                with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                    futures = [
                        executor.submit(self._create_experiment,
                                        experiment_str,
                                        id_map_thread_safe_writer,
                                        mlflow_experiments_checkpointer,
                                        error_logger) for experiment_str in fp
                    ]
                    concurrent.futures.wait(futures,
                                            return_when="FIRST_EXCEPTION")
                    propagate_exceptions(futures)
        finally:
            id_map_thread_safe_writer.close()

        end = timer()
        logging.info("Complete MLflow Experiments Import Time: " +
                     str(timedelta(seconds=end - start)))
 def download_notebooks(self,
                        ws_log_file='user_workspace.log',
                        ws_dir='artifacts/',
                        num_parallel=4):
     """
     Loop through all notebook paths in the logfile and download individual notebooks
     :param ws_log_file: logfile for all notebook paths in the workspace
     :param ws_dir: export directory to store all notebooks
     :return: the number of notebooks downloaded
     """
     checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set(
         wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT)
     ws_log = self.get_export_dir() + ws_log_file
     notebook_error_logger = logging_utils.get_error_logger(
         wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT,
         self.get_export_dir())
     num_notebooks = 0
     if not os.path.exists(ws_log):
         raise Exception(
             "Run --workspace first to download full log of all notebooks.")
     with open(ws_log, "r") as fp:
         # notebook log metadata file now contains object_id to help w/ ACL exports
         # pull the path from the data to download the individual notebook contents
         with ThreadPoolExecutor(max_workers=num_parallel) as executor:
             futures = [
                 executor.submit(self.download_notebook_helper,
                                 notebook_data, checkpoint_notebook_set,
                                 notebook_error_logger,
                                 self.get_export_dir() + ws_dir)
                 for notebook_data in fp
             ]
             for future in concurrent.futures.as_completed(futures):
                 dl_resp = future.result()
                 if 'error' not in dl_resp:
                     num_notebooks += 1
     return num_notebooks
    def log_cluster_configs(self,
                            log_file='clusters.log',
                            acl_log_file='acl_clusters.log',
                            filter_user=None):
        """
        Log the current cluster configs to a json file
        :param log_file: log the cluster configs
        :param acl_log_file: log the ACL definitions
        :param filter_user: user name to filter and log the cluster config
        :return:
        """
        cluster_log = self.get_export_dir() + log_file
        acl_cluster_log = self.get_export_dir() + acl_log_file
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.CLUSTER_OBJECT,
            self.get_export_dir())
        # pinned by cluster_user is a flag per cluster
        cl_raw = self.get_cluster_list(False)
        cluster_list = self.remove_automated_clusters(cl_raw)
        ip_list = self.get('/instance-profiles/list').get(
            'instance_profiles', [])
        nonempty_ip_list = []
        if ip_list:
            # filter none if we hit a profile w/ a none object
            # generate list of registered instance profiles to check cluster configs against
            nonempty_ip_list = list(
                filter(None,
                       [x.get('instance_profile_arn', None) for x in ip_list]))

        # filter on these items as MVP of the cluster configs
        # https://docs.databricks.com/api/latest/clusters.html#request-structure
        with open(cluster_log, 'w') as log_fp, open(acl_cluster_log,
                                                    'w') as acl_log_fp:
            for cluster_json in cluster_list:
                run_properties = set(list(
                    cluster_json.keys())) - self.create_configs
                for p in run_properties:
                    del cluster_json[p]
                if 'aws_attributes' in cluster_json:
                    aws_conf = cluster_json.pop('aws_attributes')
                    iam_role = aws_conf.get('instance_profile_arn', None)
                    if iam_role and ip_list:
                        if iam_role not in nonempty_ip_list:
                            logging.info("Skipping log of default IAM role: " +
                                         iam_role)
                            del aws_conf['instance_profile_arn']
                            cluster_json['aws_attributes'] = aws_conf
                    cluster_json['aws_attributes'] = aws_conf
                cluster_perms = self.get_cluster_acls(
                    cluster_json['cluster_id'], cluster_json['cluster_name'])
                if cluster_perms['http_status_code'] == 200:
                    acl_log_fp.write(json.dumps(cluster_perms) + '\n')
                else:
                    error_logger.error(
                        f'Failed to get cluster ACL: {cluster_perms}')

                if filter_user:
                    if cluster_json['creator_user_name'] == filter_user:
                        log_fp.write(json.dumps(cluster_json) + '\n')
                else:
                    log_fp.write(json.dumps(cluster_json) + '\n')
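
self.create_configs (defined elsewhere in the class) acts as a whitelist of attributes the cluster create API accepts; the set subtraction above strips everything else returned by the list API before logging. An equivalent sketch with an assumed whitelist:

create_configs_example = {'cluster_id', 'cluster_name', 'spark_version',
                          'node_type_id', 'num_workers', 'autoscale', 'aws_attributes'}
cluster_json_example = {'cluster_id': '0101-010101-abc123', 'cluster_name': 'etl',
                        'spark_version': '11.3.x-scala2.12', 'state': 'RUNNING'}
filtered = {k: v for k, v in cluster_json_example.items() if k in create_configs_example}
# runtime-only fields such as 'state' are dropped before the config is logged
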
    def export_user_home(self, username, local_export_dir, num_parallel=4):
        """
        Export the provided user's home directory
        :param username: user's home directory to export
        :param local_export_dir: folder location to do single user exports
        :return: None
        """
        original_export_dir = self.get_export_dir()
        user_export_dir = self.get_export_dir() + local_export_dir
        user_root = '/Users/' + username.rstrip().lstrip()
        self.set_export_dir(user_export_dir + '/{0}/'.format(username))
        print("Export path: {0}".format(self.get_export_dir()))
        os.makedirs(self.get_export_dir(), exist_ok=True)
        workspace_log_writer = ThreadSafeWriter(
            self.get_export_dir() + 'user_workspace.log', "a")
        libs_log_writer = ThreadSafeWriter(
            self.get_export_dir() + 'libraries.log', "a")
        dir_log_writer = ThreadSafeWriter(
            self.get_export_dir() + 'user_dirs.log', "a")
        checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT)
        try:
            num_of_nbs = self.log_all_workspace_items(user_root,
                                                      workspace_log_writer,
                                                      libs_log_writer,
                                                      dir_log_writer,
                                                      checkpoint_item_log_set)
        finally:
            workspace_log_writer.close()
            libs_log_writer.close()
            dir_log_writer.close()

        if num_of_nbs == 0:
            raise ValueError(
                'User does not have any notebooks in this path. Please verify the case of the email'
            )
        num_of_nbs_dl = self.download_notebooks(ws_dir='user_artifacts/')
        print(f"Total notebooks logged: {num_of_nbs}")
        print(f"Total notebooks downloaded: {num_of_nbs_dl}")
        if num_of_nbs != num_of_nbs_dl:
            print(
                f"Notebooks logged != downloaded. Check the failed download file at: {user_export_dir}"
            )
        print(f"Exporting the notebook permissions for {username}")
        acl_notebooks_writer = ThreadSafeWriter("acl_notebooks.log", "w")
        acl_notebooks_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT,
            self.get_export_dir())
        try:
            self.log_acl_to_file('notebooks', 'user_workspace.log',
                                 acl_notebooks_writer,
                                 acl_notebooks_error_logger, num_parallel)
        finally:
            acl_notebooks_writer.close()

        print(f"Exporting the directories permissions for {username}")
        acl_directories_writer = ThreadSafeWriter("acl_directories.log", "w")
        acl_directories_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT,
            self.get_export_dir())
        try:
            self.log_acl_to_file('directories', 'user_dirs.log',
                                 acl_directories_writer,
                                 acl_directories_error_logger, num_parallel)
        finally:
            acl_directories_writer.close()
        # reset the original export dir for other calls to this method using the same client
        self.set_export_dir(original_export_dir)
    def import_all_secrets(self, log_dir='secret_scopes/'):
        scopes_dir = self.get_export_dir() + log_dir
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.SECRET_OBJECT,
            self.get_export_dir())
        scopes_acl_dict = self.load_acl_dict()
        for root, subdirs, files in self.walk(scopes_dir):
            for scope_name in files:
                file_path = root + scope_name
                # print('Log file: ', file_path)
                # check if scopes acls are empty, then skip
                if scopes_acl_dict.get(scope_name, None) is None:
                    print(f"Scope {scope_name} is empty with no manage permissions. Skipping...")
                    continue
                # check if the 'users' group has CAN_MANAGE perms; if so, set it as initial_manage_principal at creation time
                has_user_manage = self.has_users_can_manage_permission(
                    scope_name, scopes_acl_dict)
                create_scope_args = {'scope': scope_name}
                if has_user_manage:
                    create_scope_args['initial_manage_principal'] = 'users'
                other_permissions = self.get_all_other_permissions(
                    scope_name, scopes_acl_dict)
                create_resp = self.post('/secrets/scopes/create',
                                        create_scope_args)
                logging_utils.log_reponse_error(
                    error_logger,
                    create_resp,
                    ignore_error_list=['RESOURCE_ALREADY_EXISTS'])
                if other_permissions:
                    # use this dict minus the `users:MANAGE` permissions and apply the other permissions to the scope
                    for perm, principal_list in other_permissions.items():
                        put_acl_args = {
                            "scope": scope_name,
                            "permission": perm
                        }
                        for x in principal_list:
                            put_acl_args["principal"] = x
                            logging.info(put_acl_args)
                            put_resp = self.post('/secrets/acls/put',
                                                 put_acl_args)
                            logging_utils.log_reponse_error(
                                error_logger, put_resp)
                # loop through the scope and create the k/v pairs
                with open(file_path, 'r') as fp:
                    for s in fp:
                        s_dict = json.loads(s)
                        k = s_dict.get('name')
                        v = s_dict.get('value')
                        if 'WARNING: skipped' in v:
                            error_logger.error(
                                f"Skipping scope {scope_name} as value is corrupted due to being too large \n"
                            )
                            continue
                        try:
                            put_secret_args = {
                                'scope':
                                scope_name,
                                'key':
                                k,
                                'string_value':
                                base64.b64decode(
                                    v.encode('ascii')).decode('ascii')
                            }
                            put_resp = self.post('/secrets/put',
                                                 put_secret_args)
                            logging_utils.log_reponse_error(
                                error_logger, put_resp)
                        except Exception as error:
                            if "Invalid base64-encoded string" in str(
                                    error) or 'decode' in str(
                                        error) or "padding" in str(error):
                                error_msg = f"secret_scope: {scope_name} has invalid invalid data characters: {str(error)} skipping.. and logging to error file."
                                logging.error(error_msg)
                                error_logger.error(error_msg)

                            else:
                                raise error
Example #21
    def import_mlflow_runs(
            self,
            src_client_config,
            log_sql_file='mlflow_runs.db',
            experiment_id_map_log='mlflow_experiments_id_map.log',
            run_id_map_log='mlflow_runs_id_map.log',
            ml_run_artifacts_dir='ml_run_artifacts/',
            num_parallel=4):
        """
        Imports the Mlflow run objects. This can be run only after import_mlflow_experiments is complete.
        Input files are mlflow_runs.db, mlflow_experiments_id_map.log
        Outputs mlflow_runs_id_map.log which has the map of old_run_id -> new_run_id after imports.
        """
        src_client = MlflowClient(
            f"databricks://{src_client_config['profile']}")
        experiment_id_map = self._load_experiment_id_map(self.export_dir +
                                                         experiment_id_map_log)
        mlflow_runs_file = self.export_dir + log_sql_file
        os.makedirs(self.export_dir + ml_run_artifacts_dir, exist_ok=True)

        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT,
            self.export_dir)

        # checkpoint is required since the checkpoint file is copied into mlflow_runs_id_map.log at the end of the step.
        assert self._checkpoint_service.checkpoint_enabled, "import_mlflow_runs requires --use-checkpoint to be enabled. If " \
                                                            "you need to actually rerun, remove the corresponding " \
                                                            "checkpoint file, e.g. logs/checkpoint/import_mlflow_runs.log"

        mlflow_runs_checkpointer = self._checkpoint_service.get_checkpoint_key_map(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT)

        # This checkpointer is used to checkpoint individual steps for more optimal checkpointing.
        # e.g. checkpoint run_creation, log_batch, and artifact download_upload separately
        mlflow_runs_steps_checkpointer = self._checkpoint_service.get_checkpoint_key_map(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT + "_steps")

        start = timer()

        con = sqlite3.connect(mlflow_runs_file)
        cur = con.execute("SELECT * FROM runs")
        # TODO(kevin): make this configurable later
        runs = cur.fetchmany(10000)
        while (len(runs) > 0):
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # run_id = run[0]
                # start_time = run[1]
                # run_obj = json.loads(run[2])
                futures = [
                    executor.submit(self._create_run_and_log, src_client,
                                    mlflow_runs_file, run[0], run[1],
                                    json.loads(run[2]), experiment_id_map,
                                    self.export_dir + ml_run_artifacts_dir,
                                    error_logger, mlflow_runs_checkpointer,
                                    mlflow_runs_steps_checkpointer)
                    for run in runs
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

            runs = cur.fetchmany(10000)
        shutil.copy(mlflow_runs_checkpointer.get_file_path(),
                    self.export_dir + run_id_map_log)
        con.close()
        end = timer()
        logging.info("Complete MLflow Runs Import Time: " +
                     str(timedelta(seconds=end - start)))
Example #22
    def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log'):
        jobs_log = self.get_export_dir() + log_file
        acl_jobs_log = self.get_export_dir() + acl_file
        error_logger = logging_utils.get_error_logger(wmconstants.WM_IMPORT,
                                                      wmconstants.JOB_OBJECT,
                                                      self.get_export_dir())
        if not os.path.exists(jobs_log):
            logging.info("No job configurations to import.")
            return
        # get an old cluster id to new cluster id mapping object
        cluster_mapping = self.get_cluster_id_mapping()
        old_2_new_policy_ids = self.get_new_policy_id_dict(
        )  # dict { old_policy_id : new_policy_id }
        checkpoint_job_configs_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.JOB_OBJECT)

        def adjust_ids_for_cluster(settings):  #job_settings or task_settings
            if 'existing_cluster_id' in settings:
                old_cid = settings['existing_cluster_id']
                # set new cluster id for existing cluster attribute
                new_cid = cluster_mapping.get(old_cid, None)
                if not new_cid:
                    logging.info(
                        "Existing cluster has been removed. Resetting job to use new cluster."
                    )
                    settings.pop('existing_cluster_id')
                    settings[
                        'new_cluster'] = self.get_jobs_default_cluster_conf()
                else:
                    settings['existing_cluster_id'] = new_cid
            else:  # new cluster config
                cluster_conf = settings['new_cluster']
                if 'policy_id' in cluster_conf:
                    old_policy_id = cluster_conf['policy_id']
                    cluster_conf['policy_id'] = old_2_new_policy_ids[
                        old_policy_id]
                # check for instance pools and modify cluster attributes
                if 'instance_pool_id' in cluster_conf:
                    new_cluster_conf = self.cleanup_cluster_pool_configs(
                        cluster_conf, job_creator, True)
                else:
                    new_cluster_conf = cluster_conf
                settings['new_cluster'] = new_cluster_conf

        with open(jobs_log, 'r') as fp:
            for line in fp:
                job_conf = json.loads(line)
                # str(...) is required here: json.loads parses job_id as an int, while the
                # checkpoint stores keys as strings, so without the cast the lookup never
                # matches and the job would never be recognized as already imported.
                if 'job_id' in job_conf and checkpoint_job_configs_set.contains(
                        str(job_conf['job_id'])):
                    continue
                job_creator = job_conf.get('creator_user_name', '')
                job_settings = job_conf['settings']
                job_schedule = job_settings.get('schedule', None)
                if job_schedule:
                    # set all imported jobs as paused
                    job_schedule['pause_status'] = 'PAUSED'
                    job_settings['schedule'] = job_schedule
                if 'format' not in job_settings or job_settings.get(
                        'format') == 'SINGLE_TASK':
                    adjust_ids_for_cluster(job_settings)
                else:
                    for task_settings in job_settings.get('tasks', []):
                        adjust_ids_for_cluster(task_settings)

                logging.info("Current Job Name: {0}".format(
                    job_conf['settings']['name']))
                # creator can be none if the user is no longer in the org. see our docs page
                create_resp = self.post('/jobs/create', job_settings)
                if logging_utils.check_error(create_resp):
                    logging.info(
                        "Resetting job to use default cluster configs due to expired configurations."
                    )
                    job_settings[
                        'new_cluster'] = self.get_jobs_default_cluster_conf()
                    create_resp_retry = self.post('/jobs/create', job_settings)
                    if not logging_utils.log_reponse_error(
                            error_logger, create_resp_retry):
                        if 'job_id' in job_conf:
                            checkpoint_job_configs_set.write(
                                job_conf["job_id"])
                    else:
                        raise RuntimeError(
                            "Import job has failed. Refer to the previous log messages to investigate."
                        )

                else:
                    if 'job_id' in job_conf:
                        checkpoint_job_configs_set.write(job_conf["job_id"])

        # update the jobs with their ACLs
        with open(acl_jobs_log, 'r') as acl_fp:
            job_id_by_name = self.get_job_id_by_name()
            for line in acl_fp:
                acl_conf = json.loads(line)
                if 'object_id' in acl_conf and checkpoint_job_configs_set.contains(
                        acl_conf['object_id']):
                    continue
                current_job_id = job_id_by_name[acl_conf['job_name']]
                job_path = f'jobs/{current_job_id}'  # contains `/jobs/{job_id}` path
                api = f'/preview/permissions/{job_path}'
                # get acl permissions for jobs
                acl_perms = self.build_acl_args(
                    acl_conf['access_control_list'], True)
                acl_create_args = {'access_control_list': acl_perms}
                acl_resp = self.patch(api, acl_create_args)
                if not logging_utils.log_reponse_error(
                        error_logger, acl_resp) and 'object_id' in acl_conf:
                    checkpoint_job_configs_set.write(acl_conf['object_id'])
                else:
                    raise RuntimeError(
                        "Import job has failed. Refer to the previous log messages to investigate."
                    )
        # update the imported job names
        self.update_imported_job_names(error_logger,
                                       checkpoint_job_configs_set)
Example #23
    def import_hive_metastore(self,
                              cluster_name=None,
                              metastore_dir='metastore/',
                              views_dir='metastore_views/',
                              has_unicode=False,
                              should_repair_table=False):
        metastore_local_dir = self.get_export_dir() + metastore_dir
        metastore_view_dir = self.get_export_dir() + views_dir
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.METASTORE_TABLES,
            self.get_export_dir())
        checkpoint_metastore_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.METASTORE_TABLES)
        os.makedirs(metastore_view_dir, exist_ok=True)
        (cid, ec_id) = self.get_or_launch_cluster(cluster_name)
        # get local databases
        db_list = self.listdir(metastore_local_dir)
        # make directory in DBFS root bucket path for tmp data
        self.post('/dbfs/mkdirs', {'path': '/tmp/migration/'})
        # iterate over the databases saved locally
        all_db_details_json = self.get_database_detail_dict()
        for db_name in db_list:
            # create a dir to host the view ddl if we find them
            os.makedirs(metastore_view_dir + db_name, exist_ok=True)
            # get the local database path to list tables
            local_db_path = metastore_local_dir + db_name
            # get a dict of the database attributes
            database_attributes = all_db_details_json.get(db_name, {})
            if not database_attributes:
                logging.info(all_db_details_json)
                raise ValueError(
                    'Missing Database Attributes Log. Re-run metastore export')
            create_db_resp = self.create_database_db(db_name, ec_id, cid,
                                                     database_attributes)
            if logging_utils.log_reponse_error(error_logger, create_db_resp):
                logging.error(
                    f"Failed to create database {db_name} during metastore import. Exiting Import."
                )
                return
            db_path = database_attributes.get('Location')
            if os.path.isdir(local_db_path):
                # all databases should be directories, no files at this level
                # list all the tables in the database local dir
                tables = self.listdir(local_db_path)
                for tbl_name in tables:
                    # build the path for the table where the ddl is stored
                    full_table_name = f"{db_name}.{tbl_name}"
                    if not checkpoint_metastore_set.contains(full_table_name):
                        logging.info(f"Importing table {full_table_name}")
                        local_table_ddl = metastore_local_dir + db_name + '/' + tbl_name
                        if not self.move_table_view(db_name, tbl_name,
                                                    local_table_ddl):
                            # we hit a table ddl here, so we apply the ddl
                            resp = self.apply_table_ddl(
                                local_table_ddl, ec_id, cid, db_path,
                                has_unicode)
                            if not logging_utils.log_reponse_error(
                                    error_logger, resp):
                                checkpoint_metastore_set.write(full_table_name)
                        else:
                            logging.info(
                                f'Moving view ddl to re-apply later: {db_name}.{tbl_name}'
                            )
            else:
                logging.error(
                    "Error: Only databases should exist at this level: {0}".
                    format(db_name))
            self.delete_dir_if_empty(metastore_view_dir + db_name)
        views_db_list = self.listdir(metastore_view_dir)
        for db_name in views_db_list:
            local_view_db_path = metastore_view_dir + db_name
            database_attributes = all_db_details_json.get(db_name, {})
            db_path = database_attributes.get('Location')
            if os.path.isdir(local_view_db_path):
                views = self.listdir(local_view_db_path)
                for view_name in views:
                    full_view_name = f'{db_name}.{view_name}'
                    if not checkpoint_metastore_set.contains(full_view_name):
                        logging.info(f"Importing view {full_view_name}")
                        local_view_ddl = metastore_view_dir + db_name + '/' + view_name
                        resp = self.apply_table_ddl(local_view_ddl, ec_id, cid,
                                                    db_path, has_unicode)
                        if not logging_utils.log_reponse_error(error_logger, resp):
                            checkpoint_metastore_set.write(full_view_name)
                        logging.info(resp)

        # repair legacy tables
        if should_repair_table:
            self.report_legacy_tables_to_fix()
            self.repair_legacy_tables(cluster_name)
Example #24
    def export_hive_metastore(self,
                              cluster_name=None,
                              metastore_dir='metastore/',
                              db_log='database_details.log',
                              success_log='success_metastore.log',
                              has_unicode=False):
        start = timer()
        checkpoint_metastore_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES)
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES,
            self.get_export_dir())
        instance_profiles = self.get_instance_profiles_list()
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
            current_iam_role = self.get_iam_role_by_cid(cid)
        elif instance_profiles:
            # if any instance profile exists, lets start w/ this on the first cluster to launch and export
            current_iam_role = instance_profiles[0]
            cid = self.launch_cluster(iam_role=current_iam_role)
        else:
            current_iam_role = None
            cid = self.launch_cluster()
        end = timer()
        logging.info("Cluster creation time: " +
                     str(timedelta(seconds=end - start)))
        time.sleep(5)
        ec_id = self.get_execution_context(cid)
        # if a previous success log exists, clean it up before re-running
        success_metastore_log_path = self.get_export_dir() + success_log
        database_logfile = self.get_export_dir() + db_log
        if os.path.exists(success_metastore_log_path):
            os.remove(success_metastore_log_path)
        all_dbs = self.get_all_databases(error_logger, cid, ec_id)
        resp = self.set_desc_database_helper(cid, ec_id)
        if self.is_verbose():
            logging.info(resp)
        with open(database_logfile, 'w') as fp:
            for db_name in all_dbs:
                logging.info(f"Fetching details from database: {db_name}")
                os.makedirs(self.get_export_dir() + metastore_dir + db_name,
                            exist_ok=True)
                db_json = self.get_desc_database_details(db_name, cid, ec_id)
                fp.write(json.dumps(db_json) + '\n')
                self.log_all_tables(db_name, cid, ec_id, metastore_dir,
                                    error_logger, success_metastore_log_path,
                                    current_iam_role, checkpoint_metastore_set,
                                    has_unicode)

        failed_log_file = logging_utils.get_error_log_file(
            wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES,
            self.get_export_dir())
        total_failed_entries = self.get_num_of_lines(failed_log_file)
        if not self.is_skip_failed() and self.is_aws() and total_failed_entries > 0:
            logging.info(
                "Retrying failed metastore export with registered IAM roles")
            remaining_iam_roles = instance_profiles[1:]
            self.retry_failed_metastore_export(cid, failed_log_file,
                                               error_logger,
                                               remaining_iam_roles,
                                               success_metastore_log_path,
                                               has_unicode,
                                               checkpoint_metastore_set)
            logging.info("Failed count before retry: " +
                         str(total_failed_entries))
            logging.info("Total Databases attempted export: " +
                         str(len(all_dbs)))
        else:
            logging.error("Failed count: " + str(total_failed_entries))
            logging.info("Total Databases attempted export: " +
                         str(len(all_dbs)))
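
# Hedged sketch: export_hive_metastore above counts failed entries via
# self.get_num_of_lines(failed_log_file). A minimal standalone version of such
# a helper (hypothetical; the real method may differ) treats a missing error
# log as zero failures and otherwise counts its lines.
import os

def get_num_of_lines(file_path):
    """Return the number of lines in file_path, or 0 if the file is missing."""
    if not os.path.exists(file_path):
        return 0
    with open(file_path) as fp:
        return sum(1 for _ in fp)
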
    def import_user_home(self, username, local_export_dir):
        """
        Import the provided user's home directory
        logs/user_exports/{{USERNAME}}/ stores the log files to understand what was exported
        logs/user_exports/{{USERNAME}}/user_artifacts/ stores the notebook contents
        :param username: user's home directory to export
        :param local_export_dir: the log directory for this users workspace items
        :return: None
        """
        original_export_dir = self.get_export_dir()
        user_import_dir = self.get_export_dir() + local_export_dir
        if self.does_user_exist(username):
            print("User exists; proceeding with the upload")
        else:
            print(
                "User must exist before the notebook contents can be uploaded. "
                "Please add the user to the platform first"
            )
        user_root = '/Users/' + username.strip()
        self.set_export_dir(user_import_dir + '/{0}/'.format(username))
        print("Import local path: {0}".format(self.get_export_dir()))
        notebook_dir = self.get_export_dir() + 'user_artifacts/'
        for root, subdirs, files in self.walk(notebook_dir):
            upload_dir = '/' + root.replace(notebook_dir, '')
            # skip the two root directories ('/' and '/Users')
            if upload_dir == '/' or upload_dir == '/Users':
                continue
            if not self.is_user_ws_root(upload_dir):
                # the user's /Users/<username>/ root path already exists, so only create non-root folders
                resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
                print(resp_mkdirs)
            for f in files:
                # get full path for the local notebook file
                local_file_path = os.path.join(root, f)
                # create the full workspace upload path including the filename
                ws_file_path = upload_dir + '/' + f
                # generate json args with binary data for notebook to upload to the workspace path
                nb_input_args = self.get_user_import_args(
                    local_file_path, ws_file_path)
                # call import to the workspace
                if self.is_verbose():
                    print("Path: {0}".format(nb_input_args['path']))
                resp_upload = self.post(WS_IMPORT, nb_input_args)
                if self.is_verbose():
                    print(resp_upload)

        # import the user's workspace ACLs
        notebook_acl_logs = user_import_dir + f'/{username}/acl_notebooks.log'
        acl_notebooks_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT,
            self.get_export_dir())
        if os.path.exists(notebook_acl_logs):
            print(f"Importing the notebook acls for {username}")
            with open(notebook_acl_logs) as nb_acls_fp:
                for nb_acl_str in nb_acls_fp:
                    self.apply_acl_on_object(nb_acl_str,
                                             acl_notebooks_error_logger)

        dir_acl_logs = user_import_dir + f'/{username}/acl_directories.log'
        acl_dir_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT,
            self.get_export_dir())
        if os.path.exists(dir_acl_logs):
            print(f"Importing the directory acls for {username}")
            with open(dir_acl_logs) as dir_acls_fp:
                for dir_acl_str in dir_acls_fp:
                    self.apply_acl_on_object(dir_acl_str, acl_dir_error_logger)
        self.set_export_dir(original_export_dir)
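
# Hedged sketch: the acl_notebooks.log / acl_directories.log files read above
# are assumed to be newline-delimited JSON, one ACL record per line, with each
# raw line handed to apply_acl_on_object. A small standalone reader for that
# format (iter_acl_records is a hypothetical helper, not part of the class)
# could skip blank or malformed lines defensively:
import json
import logging

def iter_acl_records(acl_log_path):
    """Yield parsed ACL records from a newline-delimited JSON log file."""
    with open(acl_log_path) as fp:
        for line_num, raw_line in enumerate(fp, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                logging.warning("Skipping malformed ACL record on line %d", line_num)
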
    def import_cluster_configs(self,
                               log_file='clusters.log',
                               acl_log_file='acl_clusters.log',
                               filter_user=None):
        """
        Import cluster configs and update appropriate properties / tags in the new env
        :param log_file:
        :return:
        """
        cluster_log = self.get_export_dir() + log_file
        acl_cluster_log = self.get_export_dir() + acl_log_file
        if not os.path.exists(cluster_log):
            logging.info("No clusters to import.")
            return
        current_cluster_names = set([
            x.get('cluster_name', None) for x in self.get_cluster_list(False)
        ])
        old_2_new_policy_ids = self.get_new_policy_id_dict()  # dict of {old_id: new_id}
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.CLUSTER_OBJECT,
            self.get_export_dir())
        checkpoint_cluster_configs_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.CLUSTER_OBJECT)
        # get instance pool id mappings
        with open(cluster_log, 'r') as fp:
            for line in fp:
                cluster_conf = json.loads(line)
                if 'cluster_id' in cluster_conf and checkpoint_cluster_configs_set.contains(
                        cluster_conf['cluster_id']):
                    continue
                cluster_name = cluster_conf['cluster_name']
                if cluster_name in current_cluster_names:
                    logging.info(
                        "Cluster already exists, skipping: {0}".format(
                            cluster_name))
                    continue
                cluster_creator = cluster_conf.pop('creator_user_name')
                if 'policy_id' in cluster_conf:
                    old_policy_id = cluster_conf['policy_id']
                    cluster_conf['policy_id'] = old_2_new_policy_ids[
                        old_policy_id]
                # check for instance pools and modify cluster attributes
                if 'instance_pool_id' in cluster_conf:
                    new_cluster_conf = self.cleanup_cluster_pool_configs(
                        cluster_conf, cluster_creator)
                else:
                    # update cluster configs for non-pool clusters
                    # add original creator tag to help with DBU tracking
                    if 'custom_tags' in cluster_conf:
                        tags = cluster_conf['custom_tags']
                        tags['OriginalCreator'] = cluster_creator
                        cluster_conf['custom_tags'] = tags
                    else:
                        cluster_conf['custom_tags'] = {
                            'OriginalCreator': cluster_creator
                        }
                    new_cluster_conf = cluster_conf
                print("Creating cluster: {0}".format(
                    new_cluster_conf['cluster_name']))
                cluster_resp = self.post('/clusters/create', new_cluster_conf)
                if cluster_resp['http_status_code'] == 200:
                    stop_resp = self.post(
                        '/clusters/delete',
                        {'cluster_id': cluster_resp['cluster_id']})
                    if 'pinned_by_user_name' in cluster_conf:
                        pin_resp = self.post(
                            '/clusters/pin',
                            {'cluster_id': cluster_resp['cluster_id']})
                    if 'cluster_id' in cluster_conf:
                        checkpoint_cluster_configs_set.write(
                            cluster_conf['cluster_id'])
                else:
                    logging_utils.log_reponse_error(error_logger, cluster_resp)
                    print(cluster_resp)

        # TODO: Maybe move this into a separate step to make it more rerunnable.
        self._log_cluster_ids_and_original_creators(log_file)

        # add cluster ACLs
        # loop through and reapply cluster ACLs
        with open(acl_cluster_log, 'r') as acl_fp:
            for x in acl_fp:
                data = json.loads(x)
                if 'object_id' in data and checkpoint_cluster_configs_set.contains(
                        data['object_id']):
                    continue
                cluster_name = data['cluster_name']
                print(f'Applying acl for {cluster_name}')
                acl_args = {
                    'access_control_list':
                    self.build_acl_args(data['access_control_list'])
                }
                cid = self.get_cluster_id_by_name(cluster_name)
                if cid is None:
                    error_message = f'Cluster id must exist in new env for cluster_name: {cluster_name}. ' \
                                    f'Re-import cluster configs.'
                    raise ValueError(error_message)
                api = f'/preview/permissions/clusters/{cid}'
                resp = self.put(api, acl_args)
                if not logging_utils.log_reponse_error(error_logger, resp):
                    if 'object_id' in data:
                        checkpoint_cluster_configs_set.write(data['object_id'])
                print(resp)
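
# Hedged sketch: the custom-tag handling in import_cluster_configs (adding an
# OriginalCreator tag for DBU tracking) can be expressed as a small pure
# function. tag_original_creator is a hypothetical refactoring of the same
# logic, not a helper that exists in the original code.
def tag_original_creator(cluster_conf, cluster_creator):
    """Return cluster_conf with an OriginalCreator custom tag merged in."""
    tags = cluster_conf.get('custom_tags', {})
    tags['OriginalCreator'] = cluster_creator
    cluster_conf['custom_tags'] = tags
    return cluster_conf
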
    def import_all_workspace_items(self,
                                   artifact_dir='artifacts/',
                                   archive_missing=False,
                                   num_parallel=4):
        """
        import all notebooks into a new workspace. Walks the entire artifacts/ directory in parallel, and also
        upload all the files in each of the directories in parallel.

        WARNING: Because it parallelizes both on directory walking and file uploading, it can spawn as many threads as
                 num_parallel * num_parallel

        :param artifact_dir: notebook download directory
        :param failed_log: failed import log
        :param archive_missing: whether to put missing users into a /Archive/ top level directory
        """
        src_dir = self.get_export_dir() + artifact_dir
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT,
            self.get_export_dir())

        checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT)
        num_exported_users = self.get_num_of_saved_users(src_dir)
        num_current_users = self.get_current_users()
        if num_current_users == 0:
            logging.info(
                "No registered users in existing environment. Please import users / groups first."
            )
            raise ValueError("No registered users in the current environment")
        if (num_current_users < num_exported_users) and (not archive_missing):
            logging.info("Exported number of user workspaces: {0}".format(
                num_exported_users))
            logging.info("Current number of user workspaces: {0}".format(
                num_current_users))
            logging.info(
                "Re-run with the `--archive-missing` flag to load missing users into a separate directory"
            )
            raise ValueError(
                "Current number of users is less than number of user workspaces to import."
            )
        archive_users = set()

        def _upload_all_files(root, subdirs, files):
            '''
            Upload all files in parallel in root (current) directory.
            '''
            # replace the local directory with empty string to get the notebook workspace directory
            nb_dir = '/' + root.replace(src_dir, '')
            upload_dir = nb_dir
            if not nb_dir == '/':
                upload_dir = nb_dir + '/'
            if self.is_user_ws_item(upload_dir):
                ws_user = self.get_user(upload_dir)
                if archive_missing:
                    if ws_user in archive_users:
                        upload_dir = upload_dir.replace('Users', 'Archive', 1)
                    elif not self.does_user_exist(ws_user):
                        # add the user to the cache / set of missing users
                        logging.info(
                            "User workspace does not exist, adding to archive cache: {0}"
                            .format(ws_user))
                        archive_users.add(ws_user)
                        # append the archive path to the upload directory
                        upload_dir = upload_dir.replace('Users', 'Archive', 1)
                    else:
                        logging.info(
                            "User workspace exists: {0}".format(ws_user))
                elif not self.does_user_exist(ws_user):
                    logging.info(
                        "User {0} is missing. "
                        "Please re-run with --archive-missing flag "
                        "or first verify all users exist in the new workspace".
                        format(ws_user))
                    return
                else:
                    logging.info("Uploading for user: {0}".format(ws_user))
            # make the top level folder before uploading files within the loop
            if not self.is_user_ws_root(upload_dir):
                # the user's /Users/<username>/ root path already exists, so only create non-root folders
                resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
                if 'error_code' in resp_mkdirs:
                    resp_mkdirs['path'] = upload_dir
                    logging_utils.log_reponse_error(error_logger, resp_mkdirs)

            def _file_upload_helper(f):
                logging.info("Uploading: {0}".format(f))
                # create the local file path to load the DBC file
                local_file_path = os.path.join(root, f)
                # create the ws full file path including filename
                ws_file_path = upload_dir + f
                if checkpoint_notebook_set.contains(ws_file_path):
                    return
                # generate json args with binary data for notebook to upload to the workspace path
                nb_input_args = self.get_user_import_args(
                    local_file_path, ws_file_path)
                # call import to the workspace
                if self.is_verbose():
                    logging.info("Path: {0}".format(nb_input_args['path']))
                resp_upload = self.post(WS_IMPORT, nb_input_args)
                if 'error_code' in resp_upload:
                    resp_upload['path'] = ws_file_path
                    logging.info(f'Error uploading file: {ws_file_path}')
                    logging_utils.log_reponse_error(error_logger, resp_upload)
                else:
                    checkpoint_notebook_set.write(ws_file_path)

            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(_file_upload_helper, file)
                    for file in files
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(_upload_all_files, walk[0], walk[1], walk[2])
                for walk in self.walk(src_dir)
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
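
# Hedged sketch: propagate_exceptions above is assumed to surface the first
# worker failure so a partially successful parallel import does not silently
# swallow errors. Based only on standard concurrent.futures semantics, a
# minimal version (the real implementation may differ) could look like this:
def propagate_exceptions(futures):
    """Re-raise the first exception captured by any completed future."""
    for future in futures:
        if future.done() and future.exception() is not None:
            raise future.exception()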