Code Example #1
    def log_acl_to_file(self, artifact_type, read_log_filename, writer,
                        error_logger, num_parallel):
        """
        generic function to log the notebook/directory ACLs to specific file names
        :param artifact_type: set('notebooks', 'directories') ACLs to be logged
        :param read_log_filename: the list of the notebook paths / object ids
        :param write_log_filename: output file to store object_id acls
        :param error_logger: logger to log errors
        """
        read_log_path = self.get_export_dir() + read_log_filename
        if not os.path.exists(read_log_path):
            logging.info(
                f"No log exists for {read_log_path}. Skipping ACL export ...")
            return

        def _acl_log_helper(json_data):
            data = json.loads(json_data)
            obj_id = data.get('object_id', None)
            api_endpoint = '/permissions/{0}/{1}'.format(artifact_type, obj_id)
            acl_resp = self.get(api_endpoint)
            acl_resp['path'] = data.get('path')
            if logging_utils.log_reponse_error(error_logger, acl_resp):
                return
            acl_resp.pop('http_status_code')
            writer.write(json.dumps(acl_resp) + '\n')

        with open(read_log_path, 'r') as read_fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(_acl_log_helper, json_data)
                    for json_data in read_fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
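
Every example on this page pairs concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") with a call to propagate_exceptions(futures). The helper itself is not shown here; a minimal sketch of such a helper, assuming it simply re-raises the first exception captured by any finished future, could look like this:

    def propagate_exceptions(futures):
        """Hypothetical sketch: surface the first worker failure on the main thread.

        The real helper in databrickslabs/migrate may differ; the call sites on
        this page only need one of the captured exceptions re-raised after
        concurrent.futures.wait() returns.
        """
        for future in futures:
            # wait(..., return_when="FIRST_EXCEPTION") may leave some futures
            # unfinished, so only inspect the ones that are done.
            if future.done() and future.exception() is not None:
                raise future.exception()
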
Code Example #2
    def import_mlflow_experiments_acls(
            self,
            acl_log='mlflow_experiments_acls.log',
            experiment_id_map_log='mlflow_experiments_id_map.log',
            num_parallel=4):
        """
        Import all experiments' permissions which are already exported in acl_log file. Finds out the new_experiment_id
        by looking up experiment_id_map_log file.

        While the permissions are persisted, the original creator (tagged as Created By label) is not persisted.
        The creator will always be set as the caller of this script.
        """
        experiment_id_map = self._load_experiment_id_map(self.export_dir +
                                                         experiment_id_map_log)
        acl_log_file = self.get_export_dir() + acl_log
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT,
            self.get_export_dir())
        checkpoint_key_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT)
        start = timer()
        with open(acl_log_file, 'r') as fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self._put_mlflow_experiment_acl, acl_str,
                                    experiment_id_map, checkpoint_key_set,
                                    error_logger) for acl_str in fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
        end = timer()
        logging.info("Complete MLflow Experiments Permissions Import Time: " +
                     str(timedelta(seconds=end - start)))
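
Several of the examples (#2, #6, #7, #8, #10, #11) pass a checkpoint key set from self._checkpoint_service.get_checkpoint_key_set(...) into the worker function so that a rerun can skip already-processed objects. At the call sites the interface is just contains(key) and write(key); a minimal sketch of such a set, assuming it is backed by an append-only file (SimpleCheckpointKeySet is a hypothetical name, not the project's class), might look like:

    import threading

    class SimpleCheckpointKeySet:
        """Hypothetical file-backed set of already-processed keys.

        Sketch only: the checkpoint service in databrickslabs/migrate may
        store and name things differently. Workers call contains(key) to
        skip finished items and write(key) after a successful import.
        """

        def __init__(self, checkpoint_file):
            self._lock = threading.Lock()
            self._fp = open(checkpoint_file, 'a+')
            self._fp.seek(0)
            self._keys = set(line.strip() for line in self._fp)

        def contains(self, key):
            return key in self._keys

        def write(self, key):
            # Serialize writes; the callers run this from many threads.
            with self._lock:
                self._keys.add(key)
                self._fp.write(key + '\n')
                self._fp.flush()
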
Code Example #3
    def test_write_with_thread_safe_writer_multithread(self):
        f1 = "test/thread_safe_writer/test_file_3.log"
        f2 = "test/thread_safe_writer/test_file_4.log"
        list_to_write = [i for i in range(10000)]
        with open(f1, "w") as write_fp:
            for data in list_to_write:
                write_fp.write(str(data) + "\n")

        file_writer = ThreadSafeWriter(f2, "w")
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = [
                executor.submit(file_writer.write,
                                str(data) + "\n") for data in list_to_write
            ]
            concurrent.futures.wait(futures)
            propagate_exceptions(futures)

        file_writer.close()

        fp1 = open(f1, "r")
        fp2 = open(f2, "r")
        f1_lines = fp1.readlines()
        f2_lines = fp2.readlines()
        fp1.close()
        fp2.close()

        # since multiple threads write to the same file, the line order is not guaranteed,
        # so we check content equality by comparing the sorted lines.
        assert not filecmp.cmp(f1, f2)
        assert sorted(f1_lines) == sorted(f2_lines)
        os.remove(f1)
        os.remove(f2)
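
The test above exercises ThreadSafeWriter, which the import/export examples also use so that many worker threads can append to a single log file. A minimal sketch of a writer with the same write/close interface, assuming it simply serializes writes behind a lock (the project's actual implementation may differ), could be:

    import threading

    class ThreadSafeWriter:
        """Minimal sketch: serialize writes to one file across threads."""

        def __init__(self, path, mode):
            self._fp = open(path, mode)
            self._lock = threading.Lock()

        def write(self, data):
            # Only one thread writes at a time, so lines never interleave.
            with self._lock:
                self._fp.write(data)

        def close(self):
            self._fp.close()
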
Code Example #4
File: ScimClient.py  Project: databrickslabs/migrate
    def import_users(self, user_log, error_logger, checkpoint_set, num_parallel):
        # first create the user identities with the required fields
        create_keys = ('emails', 'entitlements', 'displayName', 'name', 'userName')
        if not os.path.exists(user_log):
            logging.info("No users to import.")
            return
        with open(user_log, 'r') as fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self._import_users_helper, user_data,
                                    create_keys, checkpoint_set, error_logger)
                    for user_data in fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

        with open(self.get_export_dir() + "user_name_to_user_id.log", 'w') as fp:
            fp.write(json.dumps(self.get_user_id_mapping()))
Code Example #5
    def test_should_propagate_exception(self):
        def do_something_good():
            return 'howdy'

        def do_something_bad():
            raise MyBadException('something bad happened')

        def run_stuff():
            fut1 = ThreadPoolExecutor(2).submit(do_something_good)
            fut2 = ThreadPoolExecutor(2).submit(do_something_bad)
            return fut1, fut2

        with self.assertRaises(MyBadException):
            futures = run_stuff()
            concurrent.futures.wait(futures)
            propagate_exceptions(futures)
Code Example #6
    def import_workspace_acls(self,
                              workspace_log_file='acl_notebooks.log',
                              dir_log_file='acl_directories.log',
                              num_parallel=1):
        """
        import the notebook and directory acls by looping over notebook and dir logfiles
        """
        dir_acl_logs = self.get_export_dir() + dir_log_file
        notebook_acl_logs = self.get_export_dir() + workspace_log_file
        acl_notebooks_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT,
            self.get_export_dir())

        checkpoint_notebook_acl_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT)
        with open(notebook_acl_logs) as nb_acls_fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self.apply_acl_on_object, nb_acl_str,
                                    acl_notebooks_error_logger,
                                    checkpoint_notebook_acl_set)
                    for nb_acl_str in nb_acls_fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

        acl_dir_error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT,
            self.get_export_dir())
        checkpoint_dir_acl_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT)

        with open(dir_acl_logs) as dir_acls_fp:
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(self.apply_acl_on_object, dir_acl_str,
                                    acl_dir_error_logger,
                                    checkpoint_dir_acl_set)
                    for dir_acl_str in dir_acls_fp
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
        print("Completed importing ACLs of Notebooks and Directories")
Code Example #7
    def import_mlflow_experiments(self,
                                  log_file='mlflow_experiments.log',
                                  id_map_file='mlflow_experiments_id_map.log',
                                  log_dir=None,
                                  num_parallel=4):
        mlflow_experiments_dir = log_dir if log_dir else self.export_dir
        experiments_logfile = mlflow_experiments_dir + log_file
        experiments_id_map_file = mlflow_experiments_dir + id_map_file

        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_OBJECT,
            self.export_dir)
        mlflow_experiments_checkpointer = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_EXPERIMENT_OBJECT)
        start = timer()

        id_map_thread_safe_writer = ThreadSafeWriter(experiments_id_map_file,
                                                     'a')

        try:
            with open(experiments_logfile, 'r') as fp:
                with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                    futures = [
                        executor.submit(self._create_experiment,
                                        experiment_str,
                                        id_map_thread_safe_writer,
                                        mlflow_experiments_checkpointer,
                                        error_logger) for experiment_str in fp
                    ]
                    concurrent.futures.wait(futures,
                                            return_when="FIRST_EXCEPTION")
                    propagate_exceptions(futures)
        finally:
            id_map_thread_safe_writer.close()

        end = timer()
        logging.info("Complete MLflow Experiments Import Time: " +
                     str(timedelta(seconds=end - start)))
Code Example #8
    def export_mlflow_experiments_acls(
            self,
            experiment_log='mlflow_experiments.log',
            acl_log_file='mlflow_experiments_acls.log',
            num_parallel=4):
        """
        Export all experiments' permissions of already exported experiment objects logged in experiment_log file.
        :return: writes the result to acl_log_file
        """
        experiments_logfile = self.export_dir + experiment_log
        acl_log_file_writer = ThreadSafeWriter(self.export_dir + acl_log_file,
                                               'a')
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_EXPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT,
            self.get_export_dir())
        checkpoint_key_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_EXPORT,
            wmconstants.MLFLOW_EXPERIMENT_PERMISSION_OBJECT)

        start = timer()
        try:
            with open(experiments_logfile, 'r') as fp:
                with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                    futures = [
                        executor.submit(self._get_mlflow_experiment_acls,
                                        acl_log_file_writer, experiment_str,
                                        checkpoint_key_set, error_logger)
                        for experiment_str in fp
                    ]
                    concurrent.futures.wait(futures,
                                            return_when="FIRST_EXCEPTION")
                    propagate_exceptions(futures)
        finally:
            acl_log_file_writer.close()
        end = timer()
        logging.info("Complete MLflow Experiments Permissions Export Time: " +
                     str(timedelta(seconds=end - start)))
Code Example #9
    def import_mlflow_runs(
            self,
            src_client_config,
            log_sql_file='mlflow_runs.db',
            experiment_id_map_log='mlflow_experiments_id_map.log',
            run_id_map_log='mlflow_runs_id_map.log',
            ml_run_artifacts_dir='ml_run_artifacts/',
            num_parallel=4):
        """
        Imports the Mlflow run objects. This can be run only after import_mlflow_experiments is complete.
        Input files are mlflow_runs.db, mlflow_experiments_id_map.log
        Outputs mlflow_runs_id_map.log which has the map of old_run_id -> new_run_id after imports.
        """
        src_client = MlflowClient(
            f"databricks://{src_client_config['profile']}")
        experiment_id_map = self._load_experiment_id_map(self.export_dir +
                                                         experiment_id_map_log)
        mlflow_runs_file = self.export_dir + log_sql_file
        os.makedirs(self.export_dir + ml_run_artifacts_dir, exist_ok=True)

        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT,
            self.export_dir)

        # checkpoint is required since the checkpoint file is copied into mlflow_runs_id_map.log at the end of the step.
        assert self._checkpoint_service.checkpoint_enabled, "import_mlflow_runs requires --use-checkpoint to be enabled. If " \
                                                            "you need to actually rerun, remove the corresponding " \
                                                            "checkpoint file, e.g. logs/checkpoint/import_mlflow_runs.log"

        mlflow_runs_checkpointer = self._checkpoint_service.get_checkpoint_key_map(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT)

        # This checkpointer is used to checkpoint individual steps for more optimal checkpointing.
        # e.g. checkpoint run_creation, log_batch, and artifact download_upload separately
        mlflow_runs_steps_checkpointer = self._checkpoint_service.get_checkpoint_key_map(
            wmconstants.WM_IMPORT, wmconstants.MLFLOW_RUN_OBJECT + "_steps")

        start = timer()

        con = sqlite3.connect(mlflow_runs_file)
        cur = con.execute("SELECT * FROM runs")
        # TODO(kevin): make this configurable later
        runs = cur.fetchmany(10000)
        while (len(runs) > 0):
            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # run_id = run[0]
                # start_time = run[1]
                # run_obj = json.loads(run[2])
                futures = [
                    executor.submit(self._create_run_and_log, src_client,
                                    mlflow_runs_file, run[0], run[1],
                                    json.loads(run[2]), experiment_id_map,
                                    self.export_dir + ml_run_artifacts_dir,
                                    error_logger, mlflow_runs_checkpointer,
                                    mlflow_runs_steps_checkpointer)
                    for run in runs
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

            runs = cur.fetchmany(10000)
        shutil.copy(mlflow_runs_checkpointer.get_file_path(),
                    self.export_dir + run_id_map_log)
        con.close()
        end = timer()
        logging.info("Complete MLflow Runs Import Time: " +
                     str(timedelta(seconds=end - start)))
Code Example #10
        def _upload_all_files(root, subdirs, files):
            '''
            Upload all files in the root (current) directory in parallel.
            '''
            # replace the local directory with empty string to get the notebook workspace directory
            nb_dir = '/' + root.replace(src_dir, '')
            upload_dir = nb_dir
            if not nb_dir == '/':
                upload_dir = nb_dir + '/'
            if self.is_user_ws_item(upload_dir):
                ws_user = self.get_user(upload_dir)
                if archive_missing:
                    if ws_user in archive_users:
                        upload_dir = upload_dir.replace('Users', 'Archive', 1)
                    elif not self.does_user_exist(ws_user):
                        # add the user to the cache / set of missing users
                        logging.info(
                            "User workspace does not exist, adding to archive cache: {0}"
                            .format(ws_user))
                        archive_users.add(ws_user)
                        # append the archive path to the upload directory
                        upload_dir = upload_dir.replace('Users', 'Archive', 1)
                    else:
                        logging.info(
                            "User workspace exists: {0}".format(ws_user))
                elif not self.does_user_exist(ws_user):
                    logging.info(
                        "User {0} is missing. "
                        "Please re-run with --archive-missing flag "
                        "or first verify all users exist in the new workspace".
                        format(ws_user))
                    return
                else:
                    logging.info("Uploading for user: {0}".format(ws_user))
            # make the top level folder before uploading files within the loop
            if not self.is_user_ws_root(upload_dir):
                # if it is the /Users/<user>/ root path, don't create the folder (it already exists)
                resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
                if 'error_code' in resp_mkdirs:
                    resp_mkdirs['path'] = upload_dir
                    logging_utils.log_reponse_error(error_logger, resp_mkdirs)

            def _file_upload_helper(f):
                logging.info("Uploading: {0}".format(f))
                # create the local file path to load the DBC file
                local_file_path = os.path.join(root, f)
                # create the ws full file path including filename
                ws_file_path = upload_dir + f
                if checkpoint_notebook_set.contains(ws_file_path):
                    return
                # generate json args with binary data for notebook to upload to the workspace path
                nb_input_args = self.get_user_import_args(
                    local_file_path, ws_file_path)
                # call import to the workspace
                if self.is_verbose():
                    logging.info("Path: {0}".format(nb_input_args['path']))
                resp_upload = self.post(WS_IMPORT, nb_input_args)
                if 'error_code' in resp_upload:
                    resp_upload['path'] = ws_file_path
                    logging.info(f'Error uploading file: {ws_file_path}')
                    logging_utils.log_reponse_error(error_logger, resp_upload)
                else:
                    checkpoint_notebook_set.write(ws_file_path)

            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(_file_upload_helper, file)
                    for file in files
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)
Code Example #11
    def import_all_workspace_items(self,
                                   artifact_dir='artifacts/',
                                   archive_missing=False,
                                   num_parallel=4):
        """
        import all notebooks into a new workspace. Walks the entire artifacts/ directory in parallel, and also
        upload all the files in each of the directories in parallel.

        WARNING: Because it parallelizes both on directory walking and file uploading, it can spawn as many threads as
                 num_parallel * num_parallel

        :param artifact_dir: notebook download directory
        :param failed_log: failed import log
        :param archive_missing: whether to put missing users into a /Archive/ top level directory
        """
        src_dir = self.get_export_dir() + artifact_dir
        error_logger = logging_utils.get_error_logger(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT,
            self.get_export_dir())

        checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set(
            wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT)
        num_exported_users = self.get_num_of_saved_users(src_dir)
        num_current_users = self.get_current_users()
        if num_current_users == 0:
            logging.info(
                "No registered users in existing environment. Please import users / groups first."
            )
            raise ValueError("No registered users in the current environment")
        if (num_current_users < num_exported_users) and (not archive_missing):
            logging.info("Exported number of user workspaces: {0}".format(
                num_exported_users))
            logging.info("Current number of user workspaces: {0}".format(
                num_current_users))
            logging.info(
                "Re-run with the `--archive-missing` flag to load missing users into a separate directory"
            )
            raise ValueError(
                "Current number of users is less than number of user workspaces to import."
            )
        archive_users = set()

        def _upload_all_files(root, subdirs, files):
            '''
            Upload all files in the root (current) directory in parallel.
            '''
            # replace the local directory with empty string to get the notebook workspace directory
            nb_dir = '/' + root.replace(src_dir, '')
            upload_dir = nb_dir
            if not nb_dir == '/':
                upload_dir = nb_dir + '/'
            if self.is_user_ws_item(upload_dir):
                ws_user = self.get_user(upload_dir)
                if archive_missing:
                    if ws_user in archive_users:
                        upload_dir = upload_dir.replace('Users', 'Archive', 1)
                    elif not self.does_user_exist(ws_user):
                        # add the user to the cache / set of missing users
                        logging.info(
                            "User workspace does not exist, adding to archive cache: {0}"
                            .format(ws_user))
                        archive_users.add(ws_user)
                        # append the archive path to the upload directory
                        upload_dir = upload_dir.replace('Users', 'Archive', 1)
                    else:
                        logging.info(
                            "User workspace exists: {0}".format(ws_user))
                elif not self.does_user_exist(ws_user):
                    logging.info(
                        "User {0} is missing. "
                        "Please re-run with --archive-missing flag "
                        "or first verify all users exist in the new workspace".
                        format(ws_user))
                    return
                else:
                    logging.info("Uploading for user: {0}".format(ws_user))
            # make the top level folder before uploading files within the loop
            if not self.is_user_ws_root(upload_dir):
                # if it is the /Users/<user>/ root path, don't create the folder (it already exists)
                resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
                if 'error_code' in resp_mkdirs:
                    resp_mkdirs['path'] = upload_dir
                    logging_utils.log_reponse_error(error_logger, resp_mkdirs)

            def _file_upload_helper(f):
                logging.info("Uploading: {0}".format(f))
                # create the local file path to load the DBC file
                local_file_path = os.path.join(root, f)
                # create the ws full file path including filename
                ws_file_path = upload_dir + f
                if checkpoint_notebook_set.contains(ws_file_path):
                    return
                # generate json args with binary data for notebook to upload to the workspace path
                nb_input_args = self.get_user_import_args(
                    local_file_path, ws_file_path)
                # call import to the workspace
                if self.is_verbose():
                    logging.info("Path: {0}".format(nb_input_args['path']))
                resp_upload = self.post(WS_IMPORT, nb_input_args)
                if 'error_code' in resp_upload:
                    resp_upload['path'] = ws_file_path
                    logging.info(f'Error uploading file: {ws_file_path}')
                    logging_utils.log_reponse_error(error_logger, resp_upload)
                else:
                    checkpoint_notebook_set.write(ws_file_path)

            with ThreadPoolExecutor(max_workers=num_parallel) as executor:
                futures = [
                    executor.submit(_file_upload_helper, file)
                    for file in files
                ]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

        with ThreadPoolExecutor(max_workers=num_parallel) as executor:
            futures = [
                executor.submit(_upload_all_files, walk[0], walk[1], walk[2])
                for walk in self.walk(src_dir)
            ]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)
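
As the docstring warns, Code Example #11 parallelizes at two levels: the outer executor walks directories while each _upload_all_files call starts its own inner executor for the files in that directory, so up to num_parallel * num_parallel upload threads can be alive at once. A stripped-down sketch of that shape, with hypothetical walk_dirs and upload_file stand-ins and reusing the propagate_exceptions sketch above:

    import concurrent.futures
    from concurrent.futures import ThreadPoolExecutor

    def upload_tree(walk_dirs, upload_file, num_parallel=4):
        """Hypothetical illustration of the nested-executor pattern above."""

        def upload_directory(files):
            # Inner pool: one per directory, uploading its files in parallel.
            with ThreadPoolExecutor(max_workers=num_parallel) as inner:
                futures = [inner.submit(upload_file, f) for f in files]
                concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
                propagate_exceptions(futures)

        # Outer pool: walk directories in parallel; at peak this allows
        # num_parallel * num_parallel upload threads to run at once.
        with ThreadPoolExecutor(max_workers=num_parallel) as outer:
            futures = [outer.submit(upload_directory, files)
                       for _root, _subdirs, files in walk_dirs]
            concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
            propagate_exceptions(futures)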