def test_log_exception(self, mock_config_file):
    """Test that a logged exception's traceback appears in the log file as JSON."""
    with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
               new_callable=PropertyMock, return_value=mock_config_file):
        # NOTE!! This should be the standard way to load the logger in each package.
        # Do NOT use LMLogger().logger because you will lose the stack context and knowledge
        # of source package, method, and line.
        lmlog = LMLogger()
        logger = LMLogger.get_logger()

        try:
            1 / 0
        except ZeroDivisionError as e:
            logger.exception(e)

        with open(lmlog.log_file, 'rt') as test_file:
            data = test_file.readlines()

        # Parse each line exactly once (previously every line was parsed twice and
        # the loop variable `d` was shadowed inside the any() comprehension).
        records = []
        for line in data:
            record = json.loads(line)  # every log line must be JSON parseable
            assert record
            records.append(record)

        # Note sometimes 'exc_info' returns None so we have to do (... or "") to make it an iterable.
        assert any('Traceback (most recent call last)' in (rec.get('exc_info') or "")
                   for rec in records)
def test_log_exception_using_error_to_log(self, mock_config_file):
    """Test that logger.error(..., exc_info=True) records exception info in the log."""
    with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
               new_callable=PropertyMock, return_value=mock_config_file):
        lmlog = LMLogger()
        logger = LMLogger.get_logger()

        try:
            assert False
        except AssertionError as e:
            # Note, using exc_info=True additionally prints the current stack context.
            logger.error(e, exc_info=True)

        with open(lmlog.log_file, 'rt') as test_file:
            data = test_file.readlines()

        # Parse each line exactly once; the unused `found` flag and the shadowed
        # loop variable from the original are gone. `.get` matches the sibling
        # test and avoids a KeyError when a record lacks 'exc_info'.
        records = []
        for line in data:
            record = json.loads(line)
            assert record, "Loaded JSON entry must not be None"
            records.append(record)

        assert any('AssertionError' in (rec.get('exc_info') or "") for rec in records)
def publish_repository(repository: Repository, username: str, access_token: str,
                       remote: Optional[str] = None, public: bool = False,
                       id_token: str = None) -> None:
    """Publish a labbook or dataset to a remote, reporting progress via job metadata.

    Args:
        repository: LabBook or Dataset to publish
        username: Username of the active user
        access_token: Bearer token for the remote
        remote: Name of the git remote (defaults to "origin")
        public: If True, publish with public visibility
        id_token: Identity token

    Raises:
        IOError: re-raised untouched.
        Exception: wrapped with a user-facing message for any other failure.
    """
    p = os.getpid()
    # Fixed: logger was fetched twice; the second assignment was redundant.
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting publish_repository({str(repository)})")

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete
        current_job.meta['feedback'] = msg
        current_job.save_meta()

    try:
        update_feedback("Publish task in queue")
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository)  # type: ignore
            wf.publish(username=username, access_token=access_token,
                       remote=remote or "origin", public=public,
                       feedback_callback=update_feedback, id_token=id_token)
    except IOError:
        raise
    except Exception as e:
        logger.exception(e)
        raise Exception("Could not publish - try to log out and log in again.")
def sync_repository(repository: Repository, username: str, override: MergeOverride,
                    remote: str = "origin", access_token: str = None,
                    pull_only: bool = False, id_token: str = None) -> int:
    """Sync a labbook or dataset with its remote.

    Returns:
        int: count returned by wf.sync (presumably commits pulled — confirm with workflow docs)
    """
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting sync_repository({str(repository)})")

    def update_meta(msg):
        # Append a feedback line to the current RQ job's metadata (no-op outside a worker).
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    try:
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository)  # type: ignore
            cnt = wf.sync(username=username, remote=remote, override=override,
                          feedback_callback=update_meta, access_token=access_token,
                          id_token=id_token, pull_only=pull_only)
        # Fixed: message previously read "(Job {p} Completed ..." — missing close paren.
        logger.info(f"(Job {p}) Completed sync_repository with cnt={cnt}")
        return cnt
    except Exception as e:
        logger.exception(f"(Job {p}) Error on sync_repository: {e}")
        raise
def publish_repository(repository: Repository, username: str, access_token: str,
                       remote: Optional[str] = None, public: bool = False,
                       id_token: str = None) -> None:
    """Publish a repository (labbook or dataset) to its remote."""
    pid = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {pid}) Starting publish_repository({str(repository)})")

    def update_meta(msg):
        """Append a feedback line to the current RQ job's metadata, if any."""
        current = get_current_job()
        if not current:
            return
        if 'feedback' not in current.meta:
            current.meta['feedback'] = msg
        else:
            current.meta['feedback'] = current.meta['feedback'] + f'\n{msg}'
        current.save_meta()

    try:
        with repository.lock():
            workflow_cls = LabbookWorkflow if isinstance(repository, LabBook) else DatasetWorkflow
            workflow = workflow_cls(repository)  # type: ignore
            workflow.publish(username=username, access_token=access_token,
                             remote=remote or "origin", public=public,
                             feedback_callback=update_meta, id_token=id_token)
    except Exception as e:
        logger.exception(f"(Job {pid}) Error on publish_repository: {e}")
        raise
def start_and_run_activity_monitor(module_name, class_name, user, owner, labbook_name,
                                   monitor_key, author_name, author_email, session_metadata):
    """Run method to run the activity monitor. It is a long running job.

    Args:
        module_name: Dotted path of the module containing the monitor class
        class_name: Name of the monitor class to instantiate
        user: Username of the active user
        owner: Owner of the labbook
        labbook_name: Name of the labbook
        monitor_key: Unique key tracking this monitor instance
        author_name: Author name passed to the monitor
        author_email: Author email passed to the monitor
        session_metadata: Metadata handed to the monitor's start() method

    Returns:
        0 to indicate no failure
    """
    logger = LMLogger.get_logger()
    logger.info("Starting Activity Monitor `{}` in PID {}".format(class_name, os.getpid()))

    try:
        # Resolve the monitor class dynamically so any registered monitor can be run here.
        m = importlib.import_module(module_name)
        monitor_cls = getattr(m, class_name)

        # Instantiate monitor class
        monitor = monitor_cls(user, owner, labbook_name, monitor_key,
                              author_name=author_name, author_email=author_email)

        # Start the monitor (blocks for the life of the monitored session).
        monitor.start(session_metadata)

        return 0
    except Exception as e:
        logger.error("Error on start_and_run_activity_monitor in pid {}: {}".format(os.getpid(), e))
        # Bare raise preserves the original traceback (`raise e` rewrites it).
        raise
def test_init(self, mock_config_file):
    """An explicitly supplied config file should be used as-is."""
    log_wrapper = LMLogger(mock_config_file)
    assert type(log_wrapper) is LMLogger
    assert log_wrapper.config_file is mock_config_file
    assert type(log_wrapper.logger) is logging.Logger
def start_labbook_container(root: str, config_path: str, username: Optional[str] = None,
                            override_image_id: Optional[str] = None) -> str:
    """Launch the Docker container for a LabBook and return its container ID.

    Args:
        root: Root directory of labbook
        config_path: Path to config file (labbook.client_config.config_file)
        username: Username of active user
        override_image_id: Force using this name of docker image (do not infer)

    Returns:
        Docker container ID
    """
    pid = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"Starting start_labbook_container(root={root}, config_path={config_path}, "
                f"username={username}, override_image_id={override_image_id}) in pid {pid}")

    try:
        container_id = start_container(labbook_root=root, config_path=config_path,
                                       override_image_id=override_image_id,
                                       username=username)
        logger.info(f"Completed start_labbook_container in pid {pid}: {container_id}")
        return container_id
    except Exception as e:
        logger.error("Error on launch_docker_container in pid {}: {}".format(pid, e))
        raise
def test_log(self, mock_config_file):
    """Messages at info level and above should be written to the log as JSON."""
    with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
               new_callable=PropertyMock, return_value=mock_config_file):
        lmlog = LMLogger()
        assert type(lmlog) is LMLogger
        assert lmlog.config_file is mock_config_file
        assert type(lmlog.logger) is logging.Logger

        log = lmlog.logger
        log.debug("##DE_BUG##")
        log.info("##IN_FO##")
        log.warning("##WA_RN##")
        log.error("##ER_ROR##")

        with open(lmlog.log_file, 'rt') as test_file:
            records = [json.loads(line) for line in test_file.readlines()]

        # All records must originate from this test module.
        assert all(rec['filename'] == 'test_logging.py' for rec in records)

        # The test expects the debug record to be absent, so info is first.
        assert len(records) >= 3
        expected = [("##IN_FO##", "INFO"), ("##WA_RN##", "WARNING"), ("##ER_ROR##", "ERROR")]
        for rec, (fragment, level) in zip(records, expected):
            assert fragment in rec['message']
            assert level == rec['levelname']
def run_dev_env_monitor(dev_env_name, key) -> int:
    """Run method to check if new Activity Monitors for a given dev env need to be started/stopped

    Args:
        dev_env_name(str): Name of the dev env to monitor
        key(str): The unique string used as the key in redis to track this DevEnvMonitor instance

    Returns:
        0 to indicate no failure
    """
    logger = LMLogger.get_logger()
    logger.debug("Checking Dev Env `{}` for activity monitors in PID {}".format(
        dev_env_name, os.getpid()))

    try:
        demm = DevEnvMonitorManager()
        dev_env = demm.get_monitor_instance(dev_env_name)
        if not dev_env:
            raise ValueError('dev_env is None')
        dev_env.run(key)
        return 0
    except Exception as e:
        logger.error("Error on run_dev_env_monitor in pid {}: {}".format(
            os.getpid(), e))
        # Bare raise preserves the original traceback (`raise e` rewrites it).
        raise
def import_labbook_from_remote(remote_url: str, username: str, config_file: str = None) -> str:
    """Import a Project from a remote URL; return the new Project's root directory."""
    pid = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(
        f"(Job {pid}) Starting import_labbook_from_remote({remote_url}, {username})"
    )

    def update_meta(msg):
        """Append a feedback line to the current RQ job's metadata, if any."""
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    try:
        # Build a short "owner/name" display path from the URL when possible.
        parts = remote_url.split("/")
        if len(parts) > 1:
            display_path = f'{parts[-2]}/{parts[-1].replace(".git", "")}'
        else:
            display_path = remote_url

        update_meta(f"Importing Project from {display_path!r}...")
        workflow = LabbookWorkflow.import_from_remote(remote_url, username, config_file)
        update_meta(f"Imported Project {workflow.labbook.name}!")
        return workflow.labbook.root_dir
    except Exception as e:
        update_meta(f"Could not import Project from {remote_url}.")
        logger.exception(f"(Job {pid}) Error on import_labbook_from_remote: {e}")
        raise
def test_init_load_from_install(self, mock_config_file):
    """Test loading the default file from the installed location"""
    with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
               new_callable=PropertyMock, return_value=mock_config_file):
        log_wrapper = LMLogger()
        assert type(log_wrapper) is LMLogger
        assert log_wrapper.config_file is mock_config_file
        assert type(log_wrapper.logger) is logging.Logger
def clean_dataset_file_cache(logged_in_username: str, dataset_owner: str, dataset_name: str,
                             cache_location: str, config_file: str = None) -> None:
    """Remove a dataset's local file cache once the dataset no longer exists and is unreferenced.

    (Original docstring said "import a dataset from a zip file" — copy/paste error.)

    The cache is only deleted when (a) the dataset itself is gone from the user's
    inventory AND (b) no labbook still links it as a submodule.

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the labbook if this dataset is linked
        dataset_name: Name of the labbook if this dataset is linked
        cache_location: Absolute path to the file cache (inside the container) for this dataset
        config_file: Optional config file path (presumably for test mocking — confirm)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting clean_dataset_file_cache(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        # Check for dataset
        try:
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} still exists. Skipping file cache clean."
            )
            return
        except InventoryException:
            # Dataset not found, move along
            pass

        # Check for submodule references
        for lb in im.list_labbooks(logged_in_username):
            for ds in im.get_linked_datasets(lb):
                if ds.namespace == dataset_owner and ds.name == dataset_name:
                    logger.info(
                        f"{logged_in_username}/{dataset_owner}/{dataset_name} still referenced by {str(lb)}."
                        f" Skipping file cache clean.")
                    return

        # If you get here the dataset no longer exists and is not used by any projects, clear files
        shutil.rmtree(cache_location)
    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
def update_unmanaged_dataset_from_remote(logged_in_username: str, access_token: str, id_token: str,
                                         dataset_owner: str, dataset_name: str) -> None:
    """Auto-update/populate an unmanaged dataset from its remote.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download

    Returns:
        None
    """
    def update_meta(msg):
        """Append a feedback line to the current RQ job's metadata, if any."""
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()
    try:
        pid = os.getpid()
        logger.info(
            f"(Job {pid}) Starting update_unmanaged_dataset_from_remote(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        dataset = InventoryManager().load_dataset(logged_in_username, dataset_owner, dataset_name)
        dataset.namespace = dataset_owner
        dataset.backend.set_default_configuration(logged_in_username, access_token, id_token)

        # Only unmanaged backends that advertise remote-update support may proceed.
        if not isinstance(dataset.backend, UnmanagedStorageBackend):
            raise ValueError("Can only auto-update unmanaged dataset types")
        if not dataset.backend.can_update_from_remote:
            raise ValueError(
                "Storage backend cannot update automatically from remote.")

        dataset.backend.update_from_remote(dataset, update_meta)
    except Exception as err:
        logger.exception(err)
        raise
def sync_repository(repository: Repository, username: str, override: MergeOverride,
                    remote: str = "origin", access_token: str = None,
                    pull_only: bool = False, id_token: str = None) -> int:
    """Sync a labbook/dataset with its remote, reporting progress via job metadata.

    Returns:
        int: count returned by wf.sync (presumably commits pulled — confirm with workflow docs)

    Raises:
        MergeConflict: re-raised so callers can surface conflict resolution.
        IOError: re-raised untouched.
        Exception: wrapped with a user-facing message for any other failure.
    """
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting sync_repository({str(repository)})")

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete
        current_job.meta['feedback'] = msg
        current_job.save_meta()

    try:
        update_feedback("Sync task in queue")
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository)  # type: ignore
            cnt = wf.sync(username=username, remote=remote, override=override,
                          feedback_callback=update_feedback, access_token=access_token,
                          id_token=id_token, pull_only=pull_only)
        # Fixed: message previously read "(Job {p} Completed ..." — missing close paren.
        logger.info(f"(Job {p}) Completed sync_repository with cnt={cnt}")
        return cnt
    except MergeConflict as me:
        logger.exception(f"(Job {p}) Merge conflict: {me}")
        raise
    except IOError:
        raise
    except Exception as e:
        logger.exception(e)
        raise Exception("Could not sync - try to log out and log in again.")
def test_init_load_from_package(self):
    """Test loading the default file from the package"""
    log_wrapper = LMLogger()
    assert type(log_wrapper) is LMLogger

    # An installed config is preferred; otherwise the packaged default is used.
    if os.path.exists(LMLogger.CONFIG_INSTALLED_LOCATION):
        expected_name = "logging.json"
    else:
        expected_name = "logging.json.default"
    assert log_wrapper.config_file.rsplit("/", 1)[1] == expected_name

    assert type(log_wrapper.logger) is logging.Logger
def export_dataset_as_zip(dataset_path: str, ds_export_directory: str) -> str:
    """Export the dataset at `dataset_path` to a zip archive; return the archive path."""
    pid = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {pid}) Starting export_dataset_as_zip({dataset_path})")
    try:
        dataset = InventoryManager().load_dataset_from_directory(dataset_path)
        with dataset.lock():
            archive_path = ZipExporter.export_dataset(dataset.root_dir, ds_export_directory)
        return archive_path
    except Exception as e:
        logger.exception(f"(Job {pid}) Error on export_dataset_as_zip: {e}")
        raise
def export_labbook_as_zip(labbook_path: str, lb_export_directory: str) -> str:
    """Export the labbook at `labbook_path` to a zip archive; return the archive path."""
    pid = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {pid}) Starting export_labbook_as_zip({labbook_path})")
    try:
        labbook = InventoryManager().load_labbook_from_directory(labbook_path)
        with labbook.lock():
            archive_path = ZipExporter.export_labbook(labbook.root_dir, lb_export_directory)
        return archive_path
    except Exception as e:
        logger.exception(f"(Job {pid}) Error on export_labbook_as_zip: {e}")
        raise
def build_labbook_image(path: str, username: str,
                        tag: Optional[str] = None, nocache: bool = False) -> str:
    """Return a docker image ID of given LabBook.

    Args:
        path: Pass-through arg to labbook root.
        username: Username of active user.
        tag: Pass-through arg to tag of docker image.
        nocache(bool): Pass-through arg to docker build.

    Returns:
        Docker image ID
    """
    logger = LMLogger.get_logger()
    logger.info(
        f"Starting build_labbook_image({path}, {username}, {tag}, {nocache}) in pid {os.getpid()}"
    )

    try:
        job = get_current_job()
        if job:
            job.meta['pid'] = os.getpid()
            job.save_meta()

        def save_metadata_callback(line: str) -> None:
            # Accumulates build-output lines into the job's 'feedback' metadata.
            # NOTE(review): `job` is captured from the enclosing scope and may be
            # None outside an RQ worker; the resulting AttributeError is swallowed
            # by this except, making feedback best-effort only — confirm intended.
            try:
                if not line:
                    return
                job.meta['feedback'] = (job.meta.get('feedback') or '') + line + '\n'
                job.save_meta()
            except Exception as e:
                logger.error(e)

        save_metadata_callback("Build task in queue")

        image_id = build_image(path, override_image_tag=tag, nocache=nocache,
                               username=username,
                               feedback_callback=save_metadata_callback)

        logger.info(
            f"Completed build_labbook_image in pid {os.getpid()}: {image_id}")
        return image_id
    except Exception as e:
        logger.error(f"Error on build_labbook_image in pid {os.getpid()}: {e}")
        raise
def test_sleep(n):
    """Used only for testing -- example method with argument.

    Args:
        n: Number of seconds to sleep.

    Returns:
        0 to indicate no failure
    """
    logger = LMLogger.get_logger()
    logger.info("Starting test_sleep({}) in pid {}".format(n, os.getpid()))

    try:
        job = get_current_job()
        # Fixed: get_current_job() returns None outside an RQ worker; previously
        # this raised AttributeError on job.meta. Guard like the sibling jobs do.
        if job:
            job.meta['sample'] = 'test_sleep metadata'
            job.save_meta()

        time.sleep(n)
        logger.info("Completed test_sleep in pid {}".format(os.getpid()))
        return 0
    except Exception as e:
        logger.error("Error on test_sleep in pid {}: {}".format(os.getpid(), e))
        raise
def test_load_logger_by_name(self, mock_config_file):
    """Test loading the logger by name rather than by LMLogger.logger. """
    with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
               new_callable=PropertyMock, return_value=mock_config_file):
        lmlog = LMLogger()

        named_logger = logging.getLogger("labmanager")
        named_logger.warning('test_load_logger_by_name')

        with open(lmlog.log_file, 'rt') as test_file:
            lines = test_file.readlines()

        # Every log line must be valid JSON.
        for line in lines:
            assert json.loads(line)

        assert any('test_load_logger_by_name' in line for line in lines)
def get_logged_in_username() -> str:
    """A Method to get the current logged in user's username

    Returns:
        str

    Raises:
        ValueError: if no user identity can be loaded from the request context.
    """
    user = get_logged_in_user()
    if not user:
        # Use the standard accessor so the record keeps the caller's
        # package/method/line context (LMLogger().logger loses it, per the
        # note in the logging tests). Message kept byte-identical.
        logger = LMLogger.get_logger()
        err = "Failed to load a user identity from request context."
        logger.error(err)
        raise ValueError(err)
    return user.username
def get_logged_in_author():
    """A Method to get the current logged in user's GitAuthor instance

    Returns:
        GitAuthor

    Raises:
        ValueError: if no user identity can be loaded from the request context.
    """
    user = get_logged_in_user()
    if not user:
        # Use the standard accessor so the record keeps the caller's
        # package/method/line context (LMLogger().logger loses it, per the
        # note in the logging tests). Message kept byte-identical.
        logger = LMLogger.get_logger()
        err = "Failed to load a user identity from request context."
        logger.error(err)
        raise ValueError(err)

    # Create a GitAuthor instance if possible
    return GitAuthor(name=user.username, email=user.email)
def hash_dataset_files(logged_in_username: str, dataset_owner: str, dataset_name: str,
                       file_list: List, config_file: str = None) -> None:
    """Hash a batch of dataset files and store the results on the current job's metadata.

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the labbook if this dataset is linked
        dataset_name: Name of the labbook if this dataset is linked
        file_list: List of files to be hashed
        config_file: Optional config file to use

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting hash_dataset_files(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        ds = InventoryManager(config_file=config_file).load_dataset(
            logged_in_username, dataset_owner, dataset_name)
        manifest = Manifest(ds, logged_in_username)
        hash_result, fast_hash_result = manifest.hash_files(file_list)

        job = get_current_job()
        if job:
            # Store results as comma-joined strings; 'None' stands in for missing hashes.
            job.meta['hash_result'] = ",".join(
                'None' if v is None else v for v in hash_result)
            job.meta['fast_hash_result'] = ",".join(
                'None' if v is None else v for v in fast_hash_result)
            job.save_meta()
    except Exception as err:
        # Fixed: error message previously named the wrong job (clean_dataset_file_cache).
        logger.error(f"(Job {p}) Error in hash_dataset_files job")
        logger.exception(err)
        raise
def import_dataset_from_zip(archive_path: str, username: str, owner: str,
                            config_file: Optional[str] = None) -> str:
    """Method to import a dataset from a zip file

    Args:
        archive_path(str): Path to the uploaded zip
        username(str): Username
        owner(str): Owner username
        config_file(str): Optional path to a labmanager config file

    Returns:
        str: directory path of imported labbook
    """
    def update_meta(msg):
        """Replace (not append) the feedback message on the current job, if any."""
        job = get_current_job()
        if not job:
            return
        job.meta['feedback'] = msg
        job.save_meta()

    pid = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(
        f"(Job {pid}) Starting import_dataset_from_zip(archive_path={archive_path},"
        f"username={username}, owner={owner}, config_file={config_file})")
    try:
        imported = ZipExporter.import_dataset(archive_path, username, owner,
                                              config_file=config_file,
                                              update_meta=update_meta)
        return imported.root_dir
    except Exception as e:
        logger.exception(
            f"(Job {pid}) Error on import_dataset_from_zip({archive_path}): {e}")
        raise
    finally:
        # Always remove the uploaded archive, whether import succeeded or not.
        if os.path.exists(archive_path):
            os.remove(archive_path)
def stop_labbook_container(container_id: str):
    """Stop the Docker container with the given ID.

    (Original docstring described returning Redis task metadata and documented an
    `image_tag` argument — copy/paste errors.)

    TODO - Take labbook as argument rather than image tag.

    Args:
        container_id(str): ID of the container to stop

    Returns:
        0 to indicate no failure
    """
    logger = LMLogger.get_logger()
    logger.info(f"Starting stop_labbook_container({container_id}) in pid {os.getpid()}")
    try:
        stop_container(container_id)
        return 0
    except Exception as e:
        logger.error("Error on stop_labbook_container in pid {}: {}".format(os.getpid(), e))
        raise
def test_incr(path):
    """Used only for testing -- create or increment a JSON counter file at `path`."""
    logger = LMLogger.get_logger()
    logger.info("Starting test_incr({}) in pid {}".format(path, os.getpid()))
    try:
        if not os.path.exists(path):
            logger.info("Creating {}".format(path))
            with open(path, 'w') as fp:
                json.dump({'amt': 1}, fp)
        else:
            logger.info("Loading {}".format(path))
            with open(path, 'r') as fp:
                amt_dict = json.load(fp)
            # Fixed: previously logged the literal "Amt = {}" (missing format arg).
            logger.info("Amt = {}".format(amt_dict['amt']))
            with open(path, 'w') as fp:
                amt_dict['amt'] = amt_dict['amt'] + 1
                json.dump(amt_dict, fp)
            logger.info("Set amt = {} in {}".format(amt_dict['amt'], path))
    except Exception as e:
        logger.error("Error on test_incr in pid {}: {}".format(os.getpid(), e))
        raise
import graphene

from confhttpproxy import ProxyRouter

from gtmcore.inventory.inventory import InventoryManager
from gtmcore.labbook.labbook import LabBook
from gtmcore.exceptions import GigantumException
from gtmcore.container.container import ContainerOperations
from gtmcore.mitmproxy.mitmproxy import MITMProxyOperations
from gtmcore.container.jupyter import check_jupyter_reachable, start_jupyter
from gtmcore.container.rserver import start_rserver
from gtmcore.logging import LMLogger
from gtmcore.activity.services import start_labbook_monitor

from lmsrvcore.auth.user import get_logged_in_username, get_logged_in_author

logger = LMLogger.get_logger()


def unique_id() -> str:
    """This is used to, e.g., identify a specific running labbook. This allows us to
    link things like activity monitors, etc. It can safely be improved or changed, as
    consumers should only expect some "random" string."""
    # NOTE(review): `uuid` (and `os`, used later in this module) are not among the
    # imports visible in this chunk — confirm they are imported elsewhere in the file.
    return uuid.uuid4().hex[:10]


class StartDevTool(graphene.relay.ClientIDMutation):
    # NOTE(review): class definition continues beyond this chunk; only the Input
    # declaration is visible here.

    class Input:
        owner = graphene.String(required=True)
        labbook_name = graphene.String(required=True)
        dev_tool = graphene.String(required=True)
def complete_dataset_upload_transaction(logged_in_username: str, logged_in_email: str,
                                        dataset_owner: str, dataset_name: str,
                                        dispatcher, config_file: str = None) -> None:
    """Finalize an upload: hash new/modified files in background jobs, update the manifest.

    (Original docstring said "import a dataset from a zip file" — copy/paste error.)

    Args:
        logged_in_username: username for the currently logged in user
        logged_in_email: email for the currently logged in user
        dataset_owner: Owner of the labbook if this dataset is linked
        dataset_name: Name of the labbook if this dataset is linked
        dispatcher: Reference to the dispatcher CLASS
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    dispatcher_obj = dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete
        current_job.meta['feedback'] = msg
        current_job.save_meta()

    def schedule_bg_hash_job():
        """Method to check if a bg job should get scheduled and do so"""
        # NOTE(review): this closure reads `manifest`, `job_list`, `job_kwargs`,
        # `job_metadata` and `p`, which are assigned later in the enclosing
        # function — it must only be called after those exist.
        num_cores = manifest.get_num_hashing_cpus()
        if sum([x.is_running for x in job_list]) < num_cores:
            for j in job_list:
                if j.is_failed is True and j.failure_count < 3:
                    # Re-schedule failed job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(hash_dataset_files,
                                                           kwargs=job_kwargs,
                                                           metadata=job_metadata)
                    j.job_key = job_key
                    update_feedback(f"Restarted failed file hashing job. \nRe-processing"
                                    f" {format_size(j.total_bytes)}...")
                    logger.info(f"(Job {p}) Restarted file hash job for"
                                f" {logged_in_username}/{dataset_owner}/{dataset_name}")
                    break

                if j.is_complete is False and j.is_running is False:
                    # Schedule new job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(hash_dataset_files,
                                                           kwargs=job_kwargs,
                                                           metadata=job_metadata)
                    j.job_key = job_key
                    logger.info(f"(Job {p}) Scheduled file hash job for"
                                f" {logged_in_username}/{dataset_owner}/{dataset_name}")
                    break

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting complete_dataset_upload_transaction(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        author = GitAuthor(name=logged_in_username, email=logged_in_email)
        # NOTE(review): this overwrites the `dispatcher_obj` built from the
        # `dispatcher` parameter above, making the parameter effectively unused
        # past this point — confirm whether that is intentional.
        dispatcher_obj = Dispatcher()
        ds = InventoryManager(config_file=config_file).load_dataset(
            logged_in_username, dataset_owner, dataset_name, author=author)
        manifest = Manifest(ds, logged_in_username)

        with ds.lock():
            # Detect changes
            status = manifest.status()

            # Collect filenames that need to be hashed
            filenames = copy.deepcopy(status.modified)
            filenames.extend(status.created)

            # If there are new/updated files, spread work across cores while providing reasonable feedback
            if filenames:
                job_list = generate_bg_hash_job_list(filenames, manifest, dispatcher_obj)
                total_bytes = sum([x.total_bytes for x in job_list])

                job_kwargs = {
                    'logged_in_username': logged_in_username,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'file_list': list(),
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'hash_dataset_files'
                }

                update_feedback(f"Please wait while file contents are analyzed. \n"
                                f"Processing {format_size(total_bytes)}...",
                                has_failures=False)
                logger.info(f"(Job {p}) Starting file hash processing for"
                            f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(job_list)} jobs")

                while True:
                    # Check if you need to schedule jobs and schedule up to 1 job per iteration
                    schedule_bg_hash_job()

                    # Refresh all job statuses and update status feedback
                    completed_job_status = [x.refresh_status() for x in job_list]
                    completed_bytes = sum([s.total_bytes for s, c in
                                           zip(job_list, completed_job_status) if c is True])
                    update_feedback(f"Please wait while file contents are analyzed. "
                                    f"{format_size(completed_bytes)} of {format_size(total_bytes)} complete...",
                                    percent_complete=(float(completed_bytes) / float(total_bytes)) * 100)

                    # Check if you are done
                    completed_or_failed = sum([(x.is_complete or (x.failure_count >= 3)) for x in job_list])
                    if completed_or_failed == len(job_list):
                        break

                    # Update once per second
                    time.sleep(1)

                # Manually complete update process for updated/created files
                failed_files = list()
                for job in job_list:
                    if job.is_complete:
                        for f, h, fh in zip(job.file_list, job.get_hash_result(),
                                            job.get_fast_hash_result()):
                            if not fh or not h:
                                failed_files.append(f)
                                continue
                            _, file_bytes, mtime = fh.split("||")
                            manifest._manifest_io.add_or_update(f, h, mtime, file_bytes)
                    else:
                        failed_files.extend(job.file_list)

                # Message for hard failures
                if failed_files:
                    detail_msg = f"The following files failed to hash. Try re-uploading the files again:\n"
                    detail_file_list = " \n".join(failed_files)
                    detail_msg = f"{detail_msg}{detail_file_list}"
                    update_feedback(f"An error occurred while processing some files. \nCheck details and re-upload.",
                                    has_failures=True, failure_detail=detail_msg)

            if status.deleted:
                manifest.hasher.delete_fast_hashes(status.deleted)
                for relative_path in status.deleted:
                    manifest._manifest_io.remove(relative_path)

            manifest._manifest_io.persist()

            # Complete sweep operation
            manifest.sweep_all_changes(status=status, upload=True)

    except Exception as err:
        # NOTE(review): this message names the wrong job ("clean_dataset_file_cache") —
        # left unchanged here since it is a runtime string; flagged for a follow-up fix.
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
def verify_dataset_contents(logged_in_username: str, access_token: str, id_token: str,
                            dataset_owner: str, dataset_name: str,
                            labbook_owner: Optional[str] = None,
                            labbook_name: Optional[str] = None) -> None:
    """Verify the local contents of a dataset (linked or standalone) via its backend.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked

    Returns:
        None
    """
    job = get_current_job()

    def update_meta(msg):
        # Append a feedback line to the job metadata (no-op outside an RQ worker).
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()
    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting verify_dataset_contents(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name},"
            f"labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        result = ds.backend.verify_contents(ds, update_meta)

        # Fixed: previously dereferenced `job` unconditionally; get_current_job()
        # returns None outside an RQ worker and this raised AttributeError.
        if job:
            job.meta['modified_keys'] = result
    except Exception as err:
        logger.exception(err)
        raise