Example #1
def publish_repository(repository: Repository,
                       username: str,
                       access_token: str,
                       remote: Optional[str] = None,
                       public: bool = False,
                       id_token: str = None) -> None:
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting publish_repository({str(repository)})")

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    try:
        update_feedback("Publish task in queue")
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository)  # type: ignore
            wf.publish(username=username,
                       access_token=access_token,
                       remote=remote or "origin",
                       public=public,
                       feedback_callback=update_feedback,
                       id_token=id_token)
    except IOError:
        raise
    except Exception as e:
        logger.exception(e)
        raise Exception("Could not publish - try to log out and log in again.")
Example #2
def publish_repository(repository: Repository, username: str, access_token: str,
                       remote: Optional[str] = None, public: bool = False, id_token: str = None) -> None:
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting publish_repository({str(repository)})")

    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    try:
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository) # type: ignore
            wf.publish(username=username, access_token=access_token, remote=remote or "origin",
                       public=public, feedback_callback=update_meta, id_token=id_token)

    except Exception as e:
        logger.exception(f"(Job {p}) Error on publish_repository: {e}")
        raise
Example #3
def run_dev_env_monitor(dev_env_name, key) -> int:
    """Run method to check if new Activity Monitors for a given dev env need to be started/stopped

        Args:
            dev_env_name(str): Name of the dev env to monitor
            key(str): The unique string used as the key in redis to track this DevEnvMonitor instance

    Returns:
        0 to indicate no failure
    """

    logger = LMLogger.get_logger()
    logger.debug(
        "Checking Dev Env `{}` for activity monitors in PID {}".format(
            dev_env_name, os.getpid()))

    try:
        demm = DevEnvMonitorManager()
        dev_env = demm.get_monitor_instance(dev_env_name)
        if not dev_env:
            raise ValueError('dev_env is None')
        dev_env.run(key)
        return 0
    except Exception as e:
        logger.error("Error on run_dev_env_monitor in pid {}: {}".format(
            os.getpid(), e))
        raise e
Example #4
def sync_repository(repository: Repository, username: str, override: MergeOverride,
                    remote: str = "origin", access_token: str = None,
                    pull_only: bool = False, id_token: str = None) -> int:
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting sync_repository({str(repository)})")

    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    try:
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository) # type: ignore
            cnt = wf.sync(username=username, remote=remote, override=override,
                          feedback_callback=update_meta, access_token=access_token,
                          id_token=id_token, pull_only=pull_only)
        logger.info(f"(Job {p} Completed sync_repository with cnt={cnt}")
        return cnt
    except Exception as e:
        logger.exception(f"(Job {p}) Error on sync_repository: {e}")
        raise
Example #5
    def test_log_exception(self, mock_config_file):
        """Test that the logging of exceptions appear as expected. """
        with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
                   new_callable=PropertyMock,
                   return_value=mock_config_file):
            # NOTE!! This should be the standard way to load the logger in each package.
            # Do NOT use LMLogger().logger because you will lose the stack context and knowledge
            # of source package, method, and line.
            lmlog = LMLogger()
            logger = LMLogger.get_logger()

        try:
            1 / 0
        except ZeroDivisionError as e:
            logger.exception(e)

        with open(lmlog.log_file, 'rt') as test_file:
            data = test_file.readlines()
            for d in data:
                # Make sure it's JSON parseable
                assert json.loads(d)
            # Note: sometimes 'exc_info' is None, so we do (... or "") to make it an iterable.
            assert any([
                'Traceback (most recent call last)' in (d.get('exc_info')
                                                        or "")
                for d in [json.loads(x) for x in data if x]
            ])
Example #6
    def test_log_exception_using_error_to_log(self, mock_config_file):
        """Test that the logging of exceptions appear as expected. """
        with patch('gtmcore.logging.LMLogger.CONFIG_INSTALLED_LOCATION',
                   new_callable=PropertyMock,
                   return_value=mock_config_file):
            lmlog = LMLogger()
            logger = LMLogger.get_logger()

        try:
            assert False
        except AssertionError as e:
            # Note, using exc_info=True additionally prints the current stack context.
            logger.error(e, exc_info=True)

        with open(lmlog.log_file, 'rt') as test_file:
            data = test_file.readlines()

            for d in data:
                # Make sure it's JSON parseable
                n = json.loads(d)
                assert n, "Loaded JSON entry must not be None"

            assert any([
                'AssertionError' in (d['exc_info'] or "")
                for d in [json.loads(x) for x in data if x]
            ])
Example #7
def start_and_run_activity_monitor(module_name, class_name, user, owner, labbook_name, monitor_key, author_name,
                                   author_email, session_metadata):
    """Run method to run the activity monitor. It is a long running job.

        Args:


    Returns:
        0 to indicate no failure
    """
    logger = LMLogger.get_logger()
    logger.info("Starting Activity Monitor `{}` in PID {}".format(class_name, os.getpid()))

    try:
        # Import the monitor class
        m = importlib.import_module(module_name)
        # get the class
        monitor_cls = getattr(m, class_name)

        # Instantiate monitor class
        monitor = monitor_cls(user, owner, labbook_name, monitor_key,
                              author_name=author_name, author_email=author_email)

        # Start the monitor
        monitor.start(session_metadata)

        return 0
    except Exception as e:
        logger.error("Error on start_and_run_activity_monitor in pid {}: {}".format(os.getpid(), e))
        raise e
Example #8
def import_labbook_from_remote(remote_url: str,
                               username: str,
                               config_file: str = None) -> str:
    """Return the root directory of the newly imported Project"""
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(
        f"(Job {p}) Starting import_labbook_from_remote({remote_url}, {username})"
    )

    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    try:
        toks = remote_url.split("/")
        if len(toks) > 1:
            proj_path = f'{toks[-2]}/{toks[-1].replace(".git", "")}'
        else:
            proj_path = remote_url
        update_meta(f"Importing Project from {proj_path!r}...")
        wf = LabbookWorkflow.import_from_remote(remote_url, username,
                                                config_file)
        update_meta(f"Imported Project {wf.labbook.name}!")
        return wf.labbook.root_dir
    except Exception as e:
        update_meta(f"Could not import Project from {remote_url}.")
        logger.exception(f"(Job {p}) Error on import_labbook_from_remote: {e}")
        raise
Example #9
def start_labbook_container(root: str, config_path: str, username: Optional[str] = None,
                            override_image_id: Optional[str] = None) -> str:
    """Return the ID of the LabBook Docker container ID.

    Args:
        root: Root directory of labbook
        config_path: Path to config file (labbook.client_config.config_file)
        username: Username of active user
        override_image_id: Force using this name of docker image (do not infer)

    Returns:
        Docker container ID
    """

    logger = LMLogger.get_logger()
    logger.info(f"Starting start_labbook_container(root={root}, config_path={config_path}, username={username}, "
                f"override_image_id={override_image_id}) in pid {os.getpid()}")

    try:
        c_id = start_container(labbook_root=root, config_path=config_path,
                               override_image_id=override_image_id, username=username)
        logger.info(f"Completed start_labbook_container in pid {os.getpid()}: {c_id}")
        return c_id
    except Exception as e:
        logger.error("Error on launch_docker_container in pid {}: {}".format(os.getpid(), e))
        raise
Example #10
def clean_dataset_file_cache(logged_in_username: str,
                             dataset_owner: str,
                             dataset_name: str,
                             cache_location: str,
                             config_file: str = None) -> None:
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        cache_location: Absolute path to the file cache (inside the container) for this dataset
        config_file: Optional config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting clean_dataset_file_cache(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        # Check for dataset
        try:
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} still exists. Skipping file cache clean."
            )
            return
        except InventoryException:
            # Dataset not found, move along
            pass

        # Check for submodule references
        for lb in im.list_labbooks(logged_in_username):
            for ds in im.get_linked_datasets(lb):
                if ds.namespace == dataset_owner and ds.name == dataset_name:
                    logger.info(
                        f"{logged_in_username}/{dataset_owner}/{dataset_name} still referenced by {str(lb)}."
                        f" Skipping file cache clean.")
                    return

        # If you get here the dataset no longer exists and is not used by any projects, clear files
        shutil.rmtree(cache_location)

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
Example #11
def update_unmanaged_dataset_from_remote(logged_in_username: str,
                                         access_token: str, id_token: str,
                                         dataset_owner: str,
                                         dataset_name: str) -> None:
    """Method to update/populate an unmanaged dataset from its remote automatically

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting update_unmanaged_dataset_from_remote(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager()
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)

        if not isinstance(ds.backend, UnmanagedStorageBackend):
            raise ValueError("Can only auto-update unmanaged dataset types")

        if not ds.backend.can_update_from_remote:
            raise ValueError(
                "Storage backend cannot update automatically from remote.")

        ds.backend.update_from_remote(ds, update_meta)

    except Exception as err:
        logger.exception(err)
        raise
Example #12
def sync_repository(repository: Repository,
                    username: str,
                    override: MergeOverride,
                    remote: str = "origin",
                    access_token: str = None,
                    pull_only: bool = False,
                    id_token: str = None) -> int:
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting sync_repository({str(repository)})")

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    try:
        update_feedback("Sync task in queue")
        with repository.lock():
            if isinstance(repository, LabBook):
                wf = LabbookWorkflow(repository)
            else:
                wf = DatasetWorkflow(repository)  # type: ignore
            cnt = wf.sync(username=username,
                          remote=remote,
                          override=override,
                          feedback_callback=update_feedback,
                          access_token=access_token,
                          id_token=id_token,
                          pull_only=pull_only)
        logger.info(f"(Job {p} Completed sync_repository with cnt={cnt}")
        return cnt
    except MergeConflict as me:
        logger.exception(f"(Job {p}) Merge conflict: {me}")
        raise
    except IOError:
        raise
    except Exception as e:
        logger.exception(e)
        raise Exception("Could not sync - try to log out and log in again.")
Example #13
def export_labbook_as_zip(labbook_path: str, lb_export_directory: str) -> str:
    """Return path to archive file of exported labbook. """
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting export_labbook_as_zip({labbook_path})")

    try:
        lb = InventoryManager().load_labbook_from_directory(labbook_path)
        with lb.lock():
            path = ZipExporter.export_labbook(lb.root_dir, lb_export_directory)
        return path
    except Exception as e:
        logger.exception(f"(Job {p}) Error on export_labbook_as_zip: {e}")
        raise
Example #14
def build_labbook_image(path: str,
                        username: str,
                        tag: Optional[str] = None,
                        nocache: bool = False) -> str:
    """Return a docker image ID of given LabBook.

    Args:
        path: Pass-through arg to labbook root.
        username: Username of active user.
        tag: Pass-through arg to tag of docker image.
        nocache(bool): Pass-through arg to docker build.

    Returns:
        Docker image ID
    """

    logger = LMLogger.get_logger()
    logger.info(
        f"Starting build_labbook_image({path}, {username}, {tag}, {nocache}) in pid {os.getpid()}"
    )

    try:
        job = get_current_job()
        if job:
            job.meta['pid'] = os.getpid()
            job.save_meta()

        def save_metadata_callback(line: str) -> None:
            try:
                if not line:
                    return
                job.meta['feedback'] = (job.meta.get('feedback')
                                        or '') + line + '\n'
                job.save_meta()
            except Exception as e:
                logger.error(e)

        save_metadata_callback("Build task in queue")
        image_id = build_image(path,
                               override_image_tag=tag,
                               nocache=nocache,
                               username=username,
                               feedback_callback=save_metadata_callback)

        logger.info(
            f"Completed build_labbook_image in pid {os.getpid()}: {image_id}")
        return image_id
    except Exception as e:
        logger.error(f"Error on build_labbook_image in pid {os.getpid()}: {e}")
        raise
Example #15
def export_dataset_as_zip(dataset_path: str, ds_export_directory: str) -> str:
    """Return path to archive file of exported dataset. """
    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(f"(Job {p}) Starting export_dataset_as_zip({dataset_path})")

    try:
        ds = InventoryManager().load_dataset_from_directory(dataset_path)
        with ds.lock():
            path = ZipExporter.export_dataset(ds.root_dir, ds_export_directory)
        return path
    except Exception as e:
        logger.exception(f"(Job {p}) Error on export_dataset_as_zip: {e}")
        raise
Example #16
def test_sleep(n):
    """Used only for testing -- example method with argument. """
    logger = LMLogger.get_logger()
    logger.info("Starting test_sleep({}) in pid {}".format(n, os.getpid()))

    try:
        job = get_current_job()
        job.meta['sample'] = 'test_sleep metadata'
        job.save_meta()

        time.sleep(n)
        logger.info("Completed test_sleep in pid {}".format(os.getpid()))
        return 0
    except Exception as e:
        logger.error("Error on test_sleep in pid {}: {}".format(os.getpid(), e))
        raise
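Note: a hypothetical sketch (not from the source) of enqueueing test_sleep with python-rq and reading back the metadata it sets; the queue name and Redis connection are assumptions.

# Hypothetical: enqueue test_sleep on an rq worker and read its job metadata.
from redis import Redis
from rq import Queue

q = Queue('default', connection=Redis())  # queue name is an assumption
job = q.enqueue(test_sleep, 3)            # runs test_sleep(3) on a worker

# ... later, once a worker has picked the job up:
job.refresh()                             # reload metadata saved by the worker
print(job.meta.get('sample'))             # -> 'test_sleep metadata'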
Example #17
def hash_dataset_files(logged_in_username: str,
                       dataset_owner: str,
                       dataset_name: str,
                       file_list: List,
                       config_file: str = None) -> None:
    """

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        file_list: List of files to be hashed
        config_file: Optional config file to use

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting hash_dataset_files(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        ds = InventoryManager(config_file=config_file).load_dataset(
            logged_in_username, dataset_owner, dataset_name)
        manifest = Manifest(ds, logged_in_username)

        hash_result, fast_hash_result = manifest.hash_files(file_list)

        job = get_current_job()
        if job:
            job.meta['hash_result'] = ",".join(
                ['None' if v is None else v for v in hash_result])
            job.meta['fast_hash_result'] = ",".join(
                ['None' if v is None else v for v in fast_hash_result])
            job.save_meta()

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
Example #18
def import_dataset_from_zip(archive_path: str,
                            username: str,
                            owner: str,
                            config_file: Optional[str] = None) -> str:
    """Method to import a dataset from a zip file

    Args:
        archive_path(str): Path to the uploaded zip
        username(str): Username
        owner(str): Owner username
        config_file(str): Optional path to a labmanager config file

    Returns:
        str: directory path of imported dataset
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        job.meta['feedback'] = msg
        job.save_meta()

    p = os.getpid()
    logger = LMLogger.get_logger()
    logger.info(
        f"(Job {p}) Starting import_dataset_from_zip(archive_path={archive_path},"
        f"username={username}, owner={owner}, config_file={config_file})")

    try:
        lb = ZipExporter.import_dataset(archive_path,
                                        username,
                                        owner,
                                        config_file=config_file,
                                        update_meta=update_meta)
        return lb.root_dir
    except Exception as e:
        logger.exception(
            f"(Job {p}) Error on import_dataset_from_zip({archive_path}): {e}")
        raise
    finally:
        if os.path.exists(archive_path):
            os.remove(archive_path)
Example #19
def stop_labbook_container(container_id: str):
    """Return a dictionary of metadata pertaining to the given task's Redis key.

    TODO - Take labbook as argument rather than image tag.

    Args:
        container_id(str): ID of the container to stop

    Returns:
        0 to indicate no failure
    """

    logger = LMLogger.get_logger()
    logger.info(f"Starting stop_labbook_container({container_id}) in pid {os.getpid()}")

    try:
        stop_container(container_id)
        return 0
    except Exception as e:
        logger.error("Error on stop_labbook_container in pid {}: {}".format(os.getpid(), e))
        raise
Example #20
def test_incr(path):
    logger = LMLogger.get_logger()
    logger.info("Starting test_incr({}) in pid {}".format(path, os.getpid()))

    try:
        amt = 1
        if not os.path.exists(path):
            logger.info("Creating {}".format(path))
            with open(path, 'w') as fp:
                json.dump({'amt': amt}, fp)
        else:
            logger.info("Loading {}".format(path))
            with open(path, 'r') as fp:
                amt_dict = json.load(fp)
            logger.info("Amt = {}")
            with open(path, 'w') as fp:
                amt_dict['amt'] = amt_dict['amt'] + 1
                json.dump(amt_dict, fp)
            logger.info("Set amt = {} in {}".format(amt_dict['amt'], path))
    except Exception as e:
        logger.error("Error on test_incr in pid {}: {}".format(os.getpid(), e))
        raise
Example #21
import uuid

import graphene
from confhttpproxy import ProxyRouter

from gtmcore.inventory.inventory import InventoryManager
from gtmcore.labbook.labbook import LabBook
from gtmcore.exceptions import GigantumException
from gtmcore.container.container import ContainerOperations
from gtmcore.mitmproxy.mitmproxy import MITMProxyOperations
from gtmcore.container.jupyter import check_jupyter_reachable, start_jupyter
from gtmcore.container.rserver import start_rserver
from gtmcore.logging import LMLogger
from gtmcore.activity.services import start_labbook_monitor

from lmsrvcore.auth.user import get_logged_in_username, get_logged_in_author

logger = LMLogger.get_logger()


def unique_id() -> str:
    """This is used to, e.g., identify a specific running labbook.

    This allows us to link things like activity monitors, etc.
    It can safely be improved or changed, as consumers should only expect some "random" string."""
    return uuid.uuid4().hex[:10]


class StartDevTool(graphene.relay.ClientIDMutation):
    class Input:
        owner = graphene.String(required=True)
        labbook_name = graphene.String(required=True)
        dev_tool = graphene.String(required=True)
Example #22
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the Relay node IDs to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise
Example #23
def check_and_import_dataset(logged_in_username: str,
                             dataset_owner: str,
                             dataset_name: str,
                             remote_url: str,
                             access_token: Optional[str] = None,
                             config_file: Optional[str] = None) -> None:
    """Job to check if a dataset exists in the user's working directory, and if not import it. This is primarily used
    when importing, syncing, or switching branches on a project with linked datasets

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        remote_url: URL of the dataset to import if needed
        access_token: The current user's access token, needed to initialize git credentials in certain situations
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting check_and_import_dataset(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        try:
            # Check for dataset already existing in the user's working directory
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} exists. Skipping auto-import."
            )
            return
        except InventoryException:
            # Dataset not found, import it
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} not found. "
                f"Auto-importing remote dataset from {remote_url}")
            config_obj = Configuration(config_file=config_file)

            if access_token:
                # If the access token is set, git creds should be configured
                remote_parts = urlsplit(remote_url)
                if remote_parts.netloc:
                    remote_target = f"{remote_parts.scheme}://{remote_parts.netloc}/"
                else:
                    remote_target = remote_parts.path

                admin_service = None
                for remote in config_obj.config['git']['remotes']:
                    if remote == remote_target:
                        admin_service = config_obj.config['git']['remotes'][
                            remote]['admin_service']
                        break

                if not admin_service:
                    raise ValueError(
                        f"Failed to configure admin service URL based on target remote: {remote_target}"
                    )

                gl_mgr = GitLabManager(remote_target,
                                       admin_service=admin_service,
                                       access_token=access_token)
                gl_mgr.configure_git_credentials(remote_target,
                                                 logged_in_username)

            gitworkflows_utils.clone_repo(
                remote_url=remote_url,
                username=logged_in_username,
                owner=dataset_owner,
                load_repository=im.load_dataset_from_directory,
                put_repository=im.put_dataset)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} auto-imported successfully"
            )

    except Exception as err:
        logger.error(f"(Job {p}) Error in check_and_import_dataset job")
        logger.exception(err)
        raise
Example #24
def verify_dataset_contents(logged_in_username: str,
                            access_token: str,
                            id_token: str,
                            dataset_owner: str,
                            dataset_name: str,
                            labbook_owner: Optional[str] = None,
                            labbook_name: Optional[str] = None) -> None:
    """Method to update/populate an unmanaged dataset from it local state

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked

    Returns:
        None
    """
    job = get_current_job()

    def update_meta(msg):
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting verify_dataset_contents(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name},"
            f"labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)

        result = ds.backend.verify_contents(ds, update_meta)
        if job:
            job.meta['modified_keys'] = result
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #25
def push_dataset_objects(objs: List[PushObject],
                         logged_in_username: str,
                         access_token: str,
                         id_token: str,
                         dataset_owner: str,
                         dataset_name: str,
                         config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend

    Args:
        objs: List of PushObjects to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs,
                                  progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([
                f"{x.object_path}|{x.dataset_path}|{x.revision}"
                for x in result.failure
            ])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #26
def pull_objects(keys: List[str],
                 logged_in_username: str,
                 access_token: str,
                 id_token: str,
                 dataset_owner: str,
                 dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`. This is because this job can be run in
    parallel multiple times with different sets of keys. You don't want to link until the very end, which is handled
    in the `download_dataset_files` job, which is what scheduled this job.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join(
                [x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #27
def download_dataset_files(logged_in_username: str,
                           access_token: str,
                           id_token: str,
                           dataset_owner: str,
                           dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
            f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(
            keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=pull_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(
                    BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(
                f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                percent_complete=0,
                has_failures=False)
            logger.info(
                f"(Job {p}) Starting file downloads for"
                f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs"
            )

            while sum([(x.is_complete or x.is_failed)
                       for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum(
                    [j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(
                    f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                    f"{format_size(total_bytes)}) - {round(pc)}% complete",
                    percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume entire batch should get re-uploaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!",
                            percent_complete=100,
                            has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("",
                            has_failures=True,
                            failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, raise an error so the UI knows there was a problem
            raise IOError(
                f"{len(failure_keys)} file(s) failed to download. Check message detail and try again."
            )

    except Exception as err:
        logger.exception(err)
        raise
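Note: the sketch below shows how a caller might schedule download_dataset_files itself, mirroring the dispatch_task() call this example already makes for pull_objects. The import path, metadata keys, and argument values are illustrative assumptions.

# Hypothetical scheduling of download_dataset_files via the Dispatcher,
# mirroring the dispatch_task() pattern used above for pull_objects.
from gtmcore.dispatcher import Dispatcher  # assumed import path

dispatcher_obj = Dispatcher()
job_key = dispatcher_obj.dispatch_task(
    method_reference=download_dataset_files,
    kwargs={'logged_in_username': 'alice',          # illustrative values
            'access_token': '<access-token>',
            'id_token': '<id-token>',
            'dataset_owner': 'alice',
            'dataset_name': 'my-dataset',
            'all_keys': True},
    metadata={'dataset': 'alice|alice|my-dataset',
              'method': 'download_dataset_files'},
    persist=True)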
Example #28
def complete_dataset_upload_transaction(logged_in_username: str,
                                        logged_in_email: str,
                                        dataset_owner: str,
                                        dataset_name: str,
                                        dispatcher,
                                        config_file: str = None) -> None:
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        logged_in_email: email for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        dispatcher: Reference to the dispatcher CLASS
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    dispatcher_obj = dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    def schedule_bg_hash_job():
        """Method to check if a bg job should get scheduled and do so"""
        num_cores = manifest.get_num_hashing_cpus()
        if sum([x.is_running for x in job_list]) < num_cores:
            for j in job_list:
                if j.is_failed is True and j.failure_count < 3:
                    # Re-schedule failed job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(
                        hash_dataset_files,
                        kwargs=job_kwargs,
                        metadata=job_metadata)
                    j.job_key = job_key
                    update_feedback(
                        f"Restarted failed file hashing job. Re-processing"
                        f" {format_size(j.total_bytes)}...")
                    logger.info(
                        f"(Job {p}) Restarted file hash job for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name}"
                    )
                    break

                if j.is_complete is False and j.is_running is False:
                    # Schedule new job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(
                        hash_dataset_files,
                        kwargs=job_kwargs,
                        metadata=job_metadata)
                    j.job_key = job_key
                    logger.info(
                        f"(Job {p}) Scheduled file hash job for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name}"
                    )
                    break

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting complete_dataset_upload_transaction(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        author = GitAuthor(name=logged_in_username, email=logged_in_email)
        ds = InventoryManager(config_file=config_file).load_dataset(
            logged_in_username, dataset_owner, dataset_name, author=author)
        manifest = Manifest(ds, logged_in_username)

        with ds.lock():
            # Detect changes
            status = manifest.status()

            # Collect filenames that need to be hashed
            filenames = copy.deepcopy(status.modified)
            filenames.extend(status.created)

            # If there are new/updated files, spread work across cores while providing reasonable feedback
            if filenames:
                job_list = generate_bg_hash_job_list(filenames, manifest,
                                                     dispatcher_obj)
                total_bytes = sum([x.total_bytes for x in job_list])

                job_kwargs = {
                    'logged_in_username': logged_in_username,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'file_list': list(),
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'hash_dataset_files'
                }

                update_feedback(
                    f"Please wait while file contents are analyzed. "
                    f"Processing {format_size(total_bytes)}...",
                    has_failures=False)
                logger.info(
                    f"(Job {p}) Starting file hash processing for"
                    f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(job_list)} jobs"
                )

                while True:
                    # Check if you need to schedule jobs and schedule up to 1 job per iteration
                    schedule_bg_hash_job()

                    # Refresh all job statuses and update status feedback
                    completed_job_status = [
                        x.refresh_status() for x in job_list
                    ]
                    completed_bytes = sum([
                        s.total_bytes
                        for s, c in zip(job_list, completed_job_status)
                        if c is True
                    ])
                    update_feedback(
                        f"Please wait while file contents are analyzed. "
                        f"{format_size(completed_bytes)} of {format_size(total_bytes)} complete...",
                        percent_complete=(float(completed_bytes) /
                                          float(total_bytes)) * 100)

                    # Check if you are done
                    completed_or_failed = sum([(x.is_complete
                                                or (x.failure_count >= 3))
                                               for x in job_list])
                    if completed_or_failed == len(job_list):
                        break

                    # Update once per second
                    time.sleep(1)

                # Manually complete update process for updated/created files
                failed_files = list()
                for job in job_list:
                    if job.is_complete:
                        for f, h, fh in zip(job.file_list,
                                            job.get_hash_result(),
                                            job.get_fast_hash_result()):
                            if not fh or not h:
                                failed_files.append(f)
                                continue

                            _, file_bytes, mtime = fh.split("||")
                            manifest._manifest_io.add_or_update(
                                f, h, mtime, file_bytes)
                    else:
                        failed_files.extend(job.file_list)

                # Message for hard failures
                if failed_files:
                    detail_msg = f"The following files failed to hash. Try re-uploading the files again:\n"
                    detail_file_list = " \n".join(failed_files)
                    detail_msg = f"{detail_msg}{detail_file_list}"
                    update_feedback(
                        f"An error occurred while processing some files. Check details and re-upload.",
                        has_failures=True,
                        failure_detail=detail_msg)

            if status.deleted:
                manifest.hasher.delete_fast_hashes(status.deleted)
                for relative_path in status.deleted:
                    manifest._manifest_io.remove(relative_path)

            manifest._manifest_io.persist()

            # Complete sweep operation
            manifest.sweep_all_changes(status=status, upload=True)

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise