def log_acl_to_file(self, artifact_type, read_log_filename, writer, error_logger, num_parallel):
    """
    Generic function to log the notebook/directory ACLs to specific file names.
    :param artifact_type: set('notebooks', 'directories') ACLs to be logged
    :param read_log_filename: the list of the notebook paths / object ids
    :param writer: thread-safe writer used to store the object_id ACLs
    :param error_logger: logger to log errors
    """
    read_log_path = self.get_export_dir() + read_log_filename
    if not os.path.exists(read_log_path):
        logging.info(f"No log exists for {read_log_path}. Skipping ACL export ...")
        return

    def _acl_log_helper(json_data):
        data = json.loads(json_data)
        obj_id = data.get('object_id', None)
        api_endpoint = '/permissions/{0}/{1}'.format(artifact_type, obj_id)
        acl_resp = self.get(api_endpoint)
        acl_resp['path'] = data.get('path')
        if logging_utils.log_reponse_error(error_logger, acl_resp):
            return
        acl_resp.pop('http_status_code')
        writer.write(json.dumps(acl_resp) + '\n')

    with open(read_log_path, 'r') as read_fp:
        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(_acl_log_helper, json_data)
                for json_data in read_fp
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
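# Hypothetical caller sketch (the method name and 'user_workspace.log' below are
# illustrative, not taken from this repo): the output file is wrapped in a
# ThreadSafeWriter because the helper threads inside log_acl_to_file all append
# to the same ACL log.
def export_notebook_acls_sketch(self, num_parallel=4):
    error_logger = logging_utils.get_error_logger(
        wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir())
    acl_writer = ThreadSafeWriter(self.get_export_dir() + 'acl_notebooks.log', 'w')
    try:
        self.log_acl_to_file('notebooks', 'user_workspace.log', acl_writer, error_logger, num_parallel)
    finally:
        acl_writer.close()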
def import_mlflow_experiments_acls(self, acl_log='mlflow_experiments_acls.log',
                                   experiment_id_map_log='mlflow_experiments_id_map.log',
                                   num_parallel=4):
    """
    Import all experiments' permissions that were previously exported into the acl_log file.
    The new_experiment_id is looked up in the experiment_id_map_log file.
    While the permissions are persisted, the original creator (the "Created By" label) is not;
    the creator will always be set to the caller of this script.
    """
    experiment_id_map = self._load_experiment_id_map(self.export_dir + experiment_id_map_log)
    acl_log_file = self.get_export_dir() + acl_log
    error_logger = logging_utils.get_error_logger(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT, self.get_export_dir())
    checkpoint_key_set = self._checkpoint_service.get_checkpoint_key_set(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT)
    start = timer()
    with open(acl_log_file, 'r') as fp:
        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(self._put_mlflow_experiment_acl, acl_str, experiment_id_map,
                                checkpoint_key_set, error_logger)
                for acl_str in fp
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
    end = timer()
    logging.info("Complete MLflow Experiments Permissions Import Time: " + str(timedelta(seconds=end - start)))
def test_write_with_thread_safe_writer_multithread(self):
    f1 = "test/thread_safe_writer/test_file_3.log"
    f2 = "test/thread_safe_writer/test_file_4.log"
    list_to_write = [i for i in range(10000)]
    with open(f1, "w") as write_fp:
        for data in list_to_write:
            write_fp.write(str(data) + "\n")

    file_writer = ThreadSafeWriter(f2, "w")
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(file_writer.write, str(data) + "\n")
            for data in list_to_write
        ]
        concurrent.futures.wait(futures)
        propagate_exceptions(futures)
    file_writer.close()

    fp1 = open(f1, "r")
    fp2 = open(f2, "r")
    f1_lines = fp1.readlines()
    f2_lines = fp2.readlines()
    fp1.close()
    fp2.close()
    # Since multiple threads write to the same file, the output order is not guaranteed.
    # Hence we test content equality by sorting the lines before comparing.
    assert (not filecmp.cmp(f1, f2))
    assert sorted(f1_lines) == sorted(f2_lines)
    os.remove(f1)
    os.remove(f2)
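# Minimal sketch of the serialization the test above relies on: every write goes
# through a single lock, so concurrent threads never interleave partial lines.
# This is an illustrative assumption, not necessarily how this repo's
# ThreadSafeWriter is implemented (it could just as well use a queue and a
# dedicated writer thread).
import threading

class LockingWriter:
    def __init__(self, path, mode):
        self._fp = open(path, mode)
        self._lock = threading.Lock()

    def write(self, data):
        # one writer at a time; each submitted string lands as a contiguous chunk
        with self._lock:
            self._fp.write(data)

    def close(self):
        self._fp.close()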
def import_users(self, user_log, error_logger, checkpoint_set, num_parallel):
    # first create the user identities with the required fields
    create_keys = ('emails', 'entitlements', 'displayName', 'name', 'userName')
    if not os.path.exists(user_log):
        logging.info("No users to import.")
        return
    with open(user_log, 'r') as fp:
        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(self._import_users_helper, user_data, create_keys,
                                checkpoint_set, error_logger)
                for user_data in fp
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
    with open(self.get_export_dir() + "user_name_to_user_id.log", 'w') as fp:
        fp.write(json.dumps(self.get_user_id_mapping()))
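# Hypothetical sketch of the per-user worker submitted above (the repo's actual
# _import_users_helper may differ): keep only the SCIM fields listed in create_keys
# and POST the user, skipping entries that were already checkpointed. The
# '/preview/scim/v2/Users' path is an assumption about the SCIM create-user
# endpoint; error handling mirrors the log_reponse_error pattern used elsewhere.
def _import_users_helper_sketch(self, user_data, create_keys, checkpoint_set, error_logger):
    user = json.loads(user_data)
    user_name = user.get('userName')
    if checkpoint_set.contains(user_name):
        return
    payload = {k: user[k] for k in create_keys if k in user}
    resp = self.post('/preview/scim/v2/Users', payload)
    if not logging_utils.log_reponse_error(error_logger, resp):
        checkpoint_set.write(user_name)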
def test_should_propagate_exception(self):
    def do_something_good():
        return 'howdy'

    def do_something_bad():
        raise MyBadException('something bad happened')

    def run_stuff():
        fut1 = ThreadPoolExecutor(2).submit(do_something_good)
        fut2 = ThreadPoolExecutor(2).submit(do_something_bad)
        return fut1, fut2

    with self.assertRaises(MyBadException):
        futures = run_stuff()
        concurrent.futures.wait(futures)
        propagate_exceptions(futures)
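# The test above pins down the contract of propagate_exceptions: after waiting on
# the futures, the first captured exception must be re-raised in the caller's
# thread. A minimal sketch that satisfies it (an illustration, not necessarily
# the repo's actual implementation):
def propagate_exceptions_sketch(futures):
    for future in futures:
        exc = future.exception()  # blocks until this future is done
        if exc is not None:
            raise exc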
def import_workspace_acls(self, workspace_log_file='acl_notebooks.log',
                          dir_log_file='acl_directories.log', num_parallel=1):
    """
    Import the notebook and directory ACLs by looping over the notebook and directory log files.
    """
    dir_acl_logs = self.get_export_dir() + dir_log_file
    notebook_acl_logs = self.get_export_dir() + workspace_log_file
    acl_notebooks_error_logger = logging_utils.get_error_logger(
        wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir())
    checkpoint_notebook_acl_set = self._checkpoint_service.get_checkpoint_key_set(
        wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT)
    with open(notebook_acl_logs) as nb_acls_fp:
        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(self.apply_acl_on_object, nb_acl_str,
                                acl_notebooks_error_logger, checkpoint_notebook_acl_set)
                for nb_acl_str in nb_acls_fp
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)

    acl_dir_error_logger = logging_utils.get_error_logger(
        wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir())
    checkpoint_dir_acl_set = self._checkpoint_service.get_checkpoint_key_set(
        wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT)
    with open(dir_acl_logs) as dir_acls_fp:
        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(self.apply_acl_on_object, dir_acl_str,
                                acl_dir_error_logger, checkpoint_dir_acl_set)
                for dir_acl_str in dir_acls_fp
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
    print("Completed import ACLs of Notebooks and Directories")
def import_mlflow_experiments(self, log_file='mlflow_experiments.log',
                              id_map_file='mlflow_experiments_id_map.log',
                              log_dir=None, num_parallel=4):
    mlflow_experiments_dir = log_dir if log_dir else self.export_dir
    experiments_logfile = mlflow_experiments_dir + log_file
    experiments_id_map_file = mlflow_experiments_dir + id_map_file
    error_logger = logging_utils.get_error_logger(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_OBJECT, self.export_dir)
    mlflow_experiments_checkpointer = self._checkpoint_service.get_checkpoint_key_set(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_OBJECT)
    start = timer()
    id_map_thread_safe_writer = ThreadSafeWriter(experiments_id_map_file, 'a')
    try:
        with open(experiments_logfile, 'r') as fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self._create_experiment, experiment_str,
                                    id_map_thread_safe_writer,
                                    mlflow_experiments_checkpointer, error_logger)
                    for experiment_str in fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
    finally:
        id_map_thread_safe_writer.close()
    end = timer()
    logging.info("Complete MLflow Experiments Import Time: " + str(timedelta(seconds=end - start)))
def export_mlflow_experiments_acls(self, experiment_log='mlflow_experiments.log',
                                   acl_log_file='mlflow_experiments_acls.log',
                                   num_parallel=4):
    """
    Export the permissions of all experiment objects that were already exported and logged in the
    experiment_log file.
    :return: writes the result to acl_log_file
    """
    experiments_logfile = self.export_dir + experiment_log
    acl_log_file_writer = ThreadSafeWriter(self.export_dir + acl_log_file, 'a')
    error_logger = logging_utils.get_error_logger(
        wmconstants.WM_EXPORT, wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT, self.get_export_dir())
    checkpoint_key_set = self._checkpoint_service.get_checkpoint_key_set(
        wmconstants.WM_EXPORT, wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT)
    start = timer()
    try:
        with open(experiments_logfile, 'r') as fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self._get_mlflow_experiment_acls, acl_log_file_writer,
                                    experiment_str, checkpoint_key_set, error_logger)
                    for experiment_str in fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
    finally:
        acl_log_file_writer.close()
    end = timer()
    logging.info("Complete MLflow Experiments Permissions Export Time: " + str(timedelta(seconds=end - start)))
def import_mlflow_runs(self, src_client_config, log_sql_file='mlflow_runs.db',
                       experiment_id_map_log='mlflow_experiments_id_map.log',
                       run_id_map_log='mlflow_runs_id_map.log',
                       ml_run_artifacts_dir='ml_run_artifacts/', num_parallel=4):
    """
    Imports the MLflow run objects. This can be run only after import_mlflow_experiments is complete.
    Input files are mlflow_runs.db and mlflow_experiments_id_map.log.
    Outputs mlflow_runs_id_map.log, which maps old_run_id -> new_run_id after the import.
    """
    src_client = MlflowClient(f"databricks://{src_client_config['profile']}")
    experiment_id_map = self._load_experiment_id_map(self.export_dir + experiment_id_map_log)
    mlflow_runs_file = self.export_dir + log_sql_file
    os.makedirs(self.export_dir + ml_run_artifacts_dir, exist_ok=True)
    error_logger = logging_utils.get_error_logger(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT, self.export_dir)
    # checkpointing is required since the checkpoint file is copied into mlflow_runs_id_map.log at the end of the step
    assert self._checkpoint_service.checkpoint_enabled, \
        "import_mlflow_runs requires --use-checkpoint to be enabled. If you need to actually rerun, " \
        "remove the corresponding checkpoint file, e.g. logs/checkpoint/import_mlflow_runs.log"
    mlflow_runs_checkpointer = self._checkpoint_service.get_checkpoint_key_map(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT)
    # This checkpointer is used to checkpoint individual steps for more optimal checkpointing,
    # e.g. checkpoint run_creation, log_batch, and artifact download_upload separately.
    mlflow_runs_steps_checkpointer = self._checkpoint_service.get_checkpoint_key_map(
        wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT + "_steps")
    start = timer()
    con = sqlite3.connect(mlflow_runs_file)
    cur = con.execute("SELECT * FROM runs")
    # TODO(kevin): make this configurable later
    runs = cur.fetchmany(10000)
    while len(runs) > 0:
        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            # run_id = run[0], start_time = run[1], run_obj = json.loads(run[2])
            futures = [
                executor.submit(self._create_run_and_log, src_client, mlflow_runs_file,
                                run[0], run[1], json.loads(run[2]), experiment_id_map,
                                self.export_dir + ml_run_artifacts_dir, error_logger,
                                mlflow_runs_checkpointer, mlflow_runs_steps_checkpointer)
                for run in runs
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
        runs = cur.fetchmany(10000)
    shutil.copy(mlflow_runs_checkpointer.get_file_path(), self.export_dir + run_id_map_log)
    con.close()
    end = timer()
    logging.info("Complete MLflow Runs Import Time: " + str(timedelta(seconds=end - start)))
def _upload_all_files(root, subdirs, files):
    '''
    Upload all files in the root (current) directory in parallel.
    '''
    # replace the local directory with an empty string to get the notebook workspace directory
    nb_dir = '/' + root.replace(src_dir, '')
    upload_dir = nb_dir
    if not nb_dir == '/':
        upload_dir = nb_dir + '/'
    if self.is_user_ws_item(upload_dir):
        ws_user = self.get_user(upload_dir)
        if archive_missing:
            if ws_user in archive_users:
                upload_dir = upload_dir.replace('Users', 'Archive', 1)
            elif not self.does_user_exist(ws_user):
                # add the user to the cache / set of missing users
                logging.info("User workspace does not exist, adding to archive cache: {0}".format(ws_user))
                archive_users.add(ws_user)
                # append the archive path to the upload directory
                upload_dir = upload_dir.replace('Users', 'Archive', 1)
            else:
                logging.info("User workspace exists: {0}".format(ws_user))
        elif not self.does_user_exist(ws_user):
            logging.info("User {0} is missing. "
                         "Please re-run with the --archive-missing flag "
                         "or first verify all users exist in the new workspace".format(ws_user))
            return
        else:
            logging.info("Uploading for user: {0}".format(ws_user))
    # make the top level folder before uploading files within the loop
    if not self.is_user_ws_root(upload_dir):
        # only create the folder if it is not the /Users/[email protected]/ root path
        resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
        if 'error_code' in resp_mkdirs:
            resp_mkdirs['path'] = upload_dir
            logging_utils.log_reponse_error(error_logger, resp_mkdirs)

    def _file_upload_helper(f):
        logging.info("Uploading: {0}".format(f))
        # create the local file path to load the DBC file
        local_file_path = os.path.join(root, f)
        # create the full workspace file path including the filename
        ws_file_path = upload_dir + f
        if checkpoint_notebook_set.contains(ws_file_path):
            return
        # generate json args with binary data for the notebook to upload to the workspace path
        nb_input_args = self.get_user_import_args(local_file_path, ws_file_path)
        # call import to the workspace
        if self.is_verbose():
            logging.info("Path: {0}".format(nb_input_args['path']))
        resp_upload = self.post(WS_IMPORT, nb_input_args)
        if 'error_code' in resp_upload:
            resp_upload['path'] = ws_file_path
            logging.info(f'Error uploading file: {ws_file_path}')
            logging_utils.log_reponse_error(error_logger, resp_upload)
        else:
            checkpoint_notebook_set.write(ws_file_path)

    with ThreadPoolExecutor(max_workers=num_parallel) as executor:
        futures = [
            executor.submit(_file_upload_helper, file)
            for file in files
        ]
        concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
        propagate_exceptions(futures)
def import_all_workspace_items(self, artifact_dir='artifacts/', archive_missing=False, num_parallel=4):
    """
    Import all notebooks into a new workspace. Walks the entire artifacts/ directory in parallel,
    and also uploads all the files in each of the directories in parallel.

    WARNING: Because it parallelizes both the directory walking and the file uploading, it can spawn
    as many threads as num_parallel * num_parallel.

    :param artifact_dir: notebook download directory
    :param archive_missing: whether to put missing users into a /Archive/ top level directory
    """
    src_dir = self.get_export_dir() + artifact_dir
    error_logger = logging_utils.get_error_logger(
        wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT, self.get_export_dir())
    checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set(
        wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT)
    num_exported_users = self.get_num_of_saved_users(src_dir)
    num_current_users = self.get_current_users()
    if num_current_users == 0:
        logging.info("No registered users in existing environment. Please import users / groups first.")
        raise ValueError("No registered users in the current environment")
    if (num_current_users < num_exported_users) and (not archive_missing):
        logging.info("Exported number of user workspaces: {0}".format(num_exported_users))
        logging.info("Current number of user workspaces: {0}".format(num_current_users))
        logging.info("Re-run with the `--archive-missing` flag to load missing users into a separate directory")
        raise ValueError("Current number of users is less than number of user workspaces to import.")
    archive_users = set()

    def _upload_all_files(root, subdirs, files):
        '''
        Upload all files in the root (current) directory in parallel.
        '''
        # replace the local directory with an empty string to get the notebook workspace directory
        nb_dir = '/' + root.replace(src_dir, '')
        upload_dir = nb_dir
        if not nb_dir == '/':
            upload_dir = nb_dir + '/'
        if self.is_user_ws_item(upload_dir):
            ws_user = self.get_user(upload_dir)
            if archive_missing:
                if ws_user in archive_users:
                    upload_dir = upload_dir.replace('Users', 'Archive', 1)
                elif not self.does_user_exist(ws_user):
                    # add the user to the cache / set of missing users
                    logging.info("User workspace does not exist, adding to archive cache: {0}".format(ws_user))
                    archive_users.add(ws_user)
                    # append the archive path to the upload directory
                    upload_dir = upload_dir.replace('Users', 'Archive', 1)
                else:
                    logging.info("User workspace exists: {0}".format(ws_user))
            elif not self.does_user_exist(ws_user):
                logging.info("User {0} is missing. "
                             "Please re-run with the --archive-missing flag "
                             "or first verify all users exist in the new workspace".format(ws_user))
                return
            else:
                logging.info("Uploading for user: {0}".format(ws_user))
        # make the top level folder before uploading files within the loop
        if not self.is_user_ws_root(upload_dir):
            # only create the folder if it is not the /Users/[email protected]/ root path
            resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
            if 'error_code' in resp_mkdirs:
                resp_mkdirs['path'] = upload_dir
                logging_utils.log_reponse_error(error_logger, resp_mkdirs)

        def _file_upload_helper(f):
            logging.info("Uploading: {0}".format(f))
            # create the local file path to load the DBC file
            local_file_path = os.path.join(root, f)
            # create the full workspace file path including the filename
            ws_file_path = upload_dir + f
            if checkpoint_notebook_set.contains(ws_file_path):
                return
            # generate json args with binary data for the notebook to upload to the workspace path
            nb_input_args = self.get_user_import_args(local_file_path, ws_file_path)
            # call import to the workspace
            if self.is_verbose():
                logging.info("Path: {0}".format(nb_input_args['path']))
            resp_upload = self.post(WS_IMPORT, nb_input_args)
            if 'error_code' in resp_upload:
                resp_upload['path'] = ws_file_path
                logging.info(f'Error uploading file: {ws_file_path}')
                logging_utils.log_reponse_error(error_logger, resp_upload)
            else:
                checkpoint_notebook_set.write(ws_file_path)

        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(_file_upload_helper, file)
                for file in files
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)

    with ThreadPoolExecutor(max_workers=num_parallel) as executor:
        futures = [
            executor.submit(_upload_all_files, walk[0], walk[1], walk[2])
            for walk in self.walk(src_dir)
        ]
        concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
        propagate_exceptions(futures)