예제 #1
0
def task(work_dir, state_path, random_file_path):
    """
    Run the end-to-end workflow of the Dependency Manager in the current process.

    A DependencyManager is created against a mocked bundle service, a run
    (identified by the current process id) is registered as a dependent of a
    parent bundle, released again, and the manager is then left running for a
    while before being stopped.

    Note: ProcessPoolExecutor must serialize everything before sending it to the worker,
          so this function needs to be defined at the top-level.

    :param work_dir: directory handed to the DependencyManager as ``worker_dir``.
    :param state_path: path handed to the DependencyManager as ``commit_file``.
    :param random_file_path: path of an existing file; it is opened in binary
        mode and returned by the mocked ``get_bundle_contents``.
    :raises AssertionError: if the run is not listed as a dependent after
        ``get``, or is still listed after ``release``.
    """
    process_id = os.getpid()

    # The open file backs the mocked bundle contents. Keeping the whole workflow
    # inside the `with` block guarantees the handle is closed once the manager
    # has been stopped, even if an assertion fails.
    with open(random_file_path, "rb") as file_obj:
        # Mock Bundle Service to return a random file object
        mock_bundle_service = MagicMock()
        mock_bundle_service.get_bundle_info = MagicMock(
            return_value={'type': "file"})
        mock_bundle_service.get_bundle_contents = MagicMock(
            return_value=file_obj)

        # Create and start a dependency manager
        print(f"{process_id}: Starting a DependencyManager...")
        dependency_manager = DependencyManager(
            commit_file=state_path,
            bundle_service=mock_bundle_service,
            worker_dir=work_dir,
            max_cache_size_bytes=2048,
            download_dependencies_max_retries=1,
        )
        dependency_manager.start()
        try:
            print(f"{process_id}: Started with work directory: {work_dir}.")

            # Register a run's UUID as a dependent of a parent bundle with UUID 0x1
            dependency_key = DependencyKey(parent_uuid="0x1",
                                           parent_path="parent")
            run_uuid = f"0x{process_id}"
            state = dependency_manager.get(run_uuid, dependency_key)
            assert (run_uuid in state.dependents
                    ), f"{process_id}: Expected {run_uuid} as one of the dependents."

            # Release the run bundle as a dependent
            dependency_manager.release(run_uuid, dependency_key)
            # NOTE(review): other callers of _fetch_dependencies in this file hold
            # dependency_manager._state_lock around the call; confirm whether
            # that is required here as well.
            dependencies = dependency_manager._fetch_dependencies()
            if dependency_key in dependencies:
                state = dependencies[dependency_key]
                print(f"{process_id}: Checking {run_uuid} in {state.dependents}")
                assert (
                    run_uuid not in state.dependents
                ), f"{process_id}: Dependent should not be in the list of dependents after unregistering."

            # Keep the dependency manager running for some time to test the loop
            time.sleep(30)
        finally:
            # Stop the Dependency Manager, even when one of the checks above
            # failed, so a started manager is never left running.
            print(f"{process_id}: Stopping DependencyManger...")
            dependency_manager.stop()
    print(f"{process_id}: Done.")
예제 #2
0
    def setUp(self):
        """Prepare a scratch work directory and a DependencyManager that uses it.

        The test is skipped through ``skipTest`` when ``module_failed`` is truthy.
        """
        if module_failed:
            self.skipTest('Issue with ratarmountcore.')

        scratch_dir = tempfile.mkdtemp()
        self.work_dir = scratch_dir
        # The manager's commit file lives inside the scratch directory.
        self.state_path = os.path.join(scratch_dir, "dependencies-state.json")

        manager_options = dict(
            commit_file=self.state_path,
            bundle_service=None,
            worker_dir=scratch_dir,
            max_cache_size_bytes=1024,
            download_dependencies_max_retries=1,
        )
        self.dependency_manager = DependencyManager(**manager_options)
예제 #3
0
def main():
    """Parse the command line, wire up the managers and run the worker.

    Steps, in order: connect to the server, configure logging, build the
    dependency manager (unless a shared file system is used), create the
    local directories, build the Docker image manager and the Worker,
    install signal handlers and finally start the worker.
    """
    args = parse_args()
    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Configure logging
    if args.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(format='%(asctime)s %(message)s', level=log_level)

    # With a shared file system there is no need to store bundles locally,
    # nor to download dependencies that are on the filesystem already, so
    # both stay None in that case.
    local_bundles_dir = None
    dependency_manager = None
    if not args.shared_file_system:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependencies_state_file = os.path.join(args.work_dir, 'dependencies-state.json')
        dependency_manager = DependencyManager(
            dependencies_state_file, bundle_service, args.work_dir, args.max_work_dir_size,
        )

    # Set up local directories
    work_dir_missing = not os.path.exists(args.work_dir)
    if work_dir_missing:
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    docker_runtime = docker_utils.get_available_runtime()
    images_state_file = os.path.join(args.work_dir, 'images-state.json')
    image_manager = DockerImageManager(
        images_state_file, args.max_image_cache_size, args.max_image_size,
    )

    worker_state_file = os.path.join(args.work_dir, 'worker-state.json')
    worker = Worker(
        image_manager,
        dependency_manager,
        worker_state_file,
        args.cpuset,
        args.gpuset,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.idle_seconds,
        bundle_service,
        args.shared_file_system,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
    )

    # Register a signal handler to ensure safe shutdown.
    def _forward_to_worker(signum, frame):
        return worker.signal()

    for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
        signal.signal(sig, _forward_to_worker)

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
예제 #4
0
def main():
    """Parse the command line, wire up the managers and run the worker.

    Steps, in order: validate the worker tag, configure logging (and sentry
    when enabled), connect to the server, build the dependency manager
    (unless a shared file system is used), build the image manager for the
    selected container runtime, create the local directories, build the
    Worker, install signal handlers and finally start the worker.

    Raises:
        argparse.ArgumentTypeError: if a tag is given that contains anything
            other than letters, numbers or hyphens.
    """
    args = parse_args()

    tag = args.tag
    if tag and not tag.replace("-", "").isalnum():
        raise argparse.ArgumentTypeError(
            "Worker tag must only contain letters, numbers or hyphens."
        )

    # Configure logging; verbose mode also records the call site of each message.
    log_format: str = '%(asctime)s %(message)s'
    log_level = logging.INFO
    if args.verbose:
        log_level = logging.DEBUG
        log_format = log_format + ' %(pathname)s %(lineno)d'
    logging.basicConfig(format=log_format, level=log_level)

    logging.getLogger('urllib3').setLevel(logging.INFO)
    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    # With a shared file system there is no need to store bundles locally,
    # nor to download dependencies that are on the filesystem already, so
    # both stay None in that case.
    local_bundles_dir = None
    dependency_manager = None
    if not args.shared_file_system:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependencies_state_file = os.path.join(args.work_dir, 'dependencies-state.json')
        dependency_manager = DependencyManager(
            dependencies_state_file,
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
            args.download_dependencies_max_retries,
        )

    use_singularity = args.container_runtime == "singularity"
    if use_singularity:
        singularity_folder = os.path.join(args.work_dir, 'codalab_singularity_images')
        folder_missing = not os.path.exists(singularity_folder)
        if folder_missing:
            logger.info(
                'Local singularity image location %s doesn\'t exist, creating.', singularity_folder,
            )
            os.makedirs(singularity_folder, 0o770)
        image_manager = SingularityImageManager(
            args.max_image_size, args.max_image_cache_size, singularity_folder,
        )
        # todo workers with singularity don't work because this is set to none -- handle this
        docker_runtime = None
    else:
        images_state_file = os.path.join(args.work_dir, 'images-state.json')
        image_manager = DockerImageManager(
            images_state_file, args.max_image_cache_size, args.max_image_size,
        )
        docker_runtime = docker_utils.get_available_runtime()

    # Set up local directories
    work_dir_missing = not os.path.exists(args.work_dir)
    if work_dir_missing:
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    # Include the worker ID in the worker state JSON path, so multiple workers
    # sharing the same work directory maintain their own state.
    worker_state_file = os.path.join(args.work_dir, f'worker-state-{args.id}.json')
    worker = Worker(
        image_manager,
        dependency_manager,
        worker_state_file,
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
        shared_memory_size_gb=args.shared_memory_size_gb,
        preemptible=args.preemptible,
    )

    # Register a signal handler to ensure safe shutdown.
    def _forward_to_worker(signum, frame):
        return worker.signal()

    for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
        signal.signal(sig, _forward_to_worker)

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
예제 #5
0
class DependencyManagerTest(unittest.TestCase):
    """Tests for DependencyManager's get / has / release / all_dependencies calls."""

    def setUp(self):
        """Create a temporary work directory and a DependencyManager using it.

        The test is skipped through ``skipTest`` when ``module_failed`` is truthy.
        """
        if module_failed:
            self.skipTest('Issue with ratarmountcore.')

        self.work_dir = tempfile.mkdtemp()
        self.state_path = os.path.join(self.work_dir,
                                       "dependencies-state.json")
        self.dependency_manager = DependencyManager(
            commit_file=self.state_path,
            bundle_service=None,
            worker_dir=self.work_dir,
            max_cache_size_bytes=1024,
            download_dependencies_max_retries=1,
        )

    def tearDown(self):
        """Remove the temporary work directory created in setUp."""
        shutil.rmtree(self.work_dir)

    def test_get_has(self):
        """get() registers the dependency and returns its initial state."""
        dependent_uuid = "0x2"
        dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
        state = self.dependency_manager.get(dependent_uuid, dependency_key)
        self.assertTrue(self.dependency_manager.has(dependency_key))
        self.assertEqual(state.stage, "DOWNLOADING")
        self.assertEqual(state.path, "0x1_parent")
        self.assertEqual(state.dependents, {dependent_uuid})

    def test_release(self):
        """release() removes one dependent at a time from the dependency state."""
        dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
        self.dependency_manager.get("0x2", dependency_key)
        state = self.dependency_manager.get("0x3", dependency_key)
        # Passing in the same dependency key with a different dependent, will just add the dependent
        self.assertEqual(state.dependents, {"0x2", "0x3"})

        # Release 0x2 as a dependent
        self.dependency_manager.release("0x2", dependency_key)
        with self.dependency_manager._state_lock:
            dependencies = self.dependency_manager._fetch_dependencies()
        state = dependencies[dependency_key]
        self.assertEqual(state.dependents, {"0x3"})

        # Release 0x3 as a dependent - should be left with no dependents
        self.dependency_manager.release("0x3", dependency_key)
        with self.dependency_manager._state_lock:
            dependencies = self.dependency_manager._fetch_dependencies()
        state = dependencies[dependency_key]
        self.assertEqual(len(state.dependents), 0)

    def test_all_dependencies(self):
        """all_dependencies lists every dependency key that was requested."""
        dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
        self.dependency_manager.get("0x2", dependency_key)
        dependency_key = DependencyKey(parent_uuid="0x3",
                                       parent_path="parent2")
        self.dependency_manager.get("0x4", dependency_key)
        dependency_keys = self.dependency_manager.all_dependencies
        self.assertEqual(len(dependency_keys), 2)

    @unittest.skip("Flufl.lock doesn't seem to work on GHA for some reason, "
                   "even though this test passes on other machines.")
    def test_concurrency(self):
        """Run ``task`` in several processes sharing one work dir and state file."""
        num_of_dependency_managers = 10

        # Seek to the last byte and write a single byte so the file is 1 GB long.
        random_file_path = os.path.join(self.work_dir, "random_file")
        with open(random_file_path, "wb") as f:
            f.seek((1024 * 1024 * 1024) - 1)  # 1 GB
            f.write(b"\0")

        # The context manager shuts the pool down even when a task raises,
        # so worker processes are not leaked on a failing run.
        with ProcessPoolExecutor(
                max_workers=num_of_dependency_managers) as executor:
            futures = [
                executor.submit(task, self.work_dir, self.state_path,
                                random_file_path)
                for _ in range(num_of_dependency_managers)
            ]
            for future in futures:
                # result() re-raises any exception raised inside the task.
                print(future.result())
                self.assertIsNone(future.exception())
예제 #6
0
def main():
    """Parse the command line, wire up the managers and run the worker.

    Steps, in order: validate the worker tag, configure logging (and sentry
    when enabled), connect to the server, build the dependency manager
    (unless a shared file system is used), create the local directories,
    build the Docker image manager and the Worker, install signal handlers
    and finally start the worker.

    Raises:
        argparse.ArgumentTypeError: if a tag is given that is not alphanumeric.
    """
    args = parse_args()

    tag = args.tag
    if tag and not tag.isalnum():
        raise argparse.ArgumentTypeError(
            "Worker tag must be alphanumeric (only contain letters and numbers)."
        )

    # Configure logging
    if args.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        format='%(asctime)s %(message)s %(pathname)s %(lineno)d', level=log_level,
    )

    logging.getLogger('urllib3').setLevel(logging.INFO)
    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    # With a shared file system there is no need to store bundles locally,
    # nor to download dependencies that are on the filesystem already, so
    # both stay None in that case.
    local_bundles_dir = None
    dependency_manager = None
    if not args.shared_file_system:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependencies_state_file = os.path.join(args.work_dir, 'dependencies-state.json')
        dependency_manager = DependencyManager(
            dependencies_state_file, bundle_service, args.work_dir, args.max_work_dir_size,
        )

    # Set up local directories
    work_dir_missing = not os.path.exists(args.work_dir)
    if work_dir_missing:
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    docker_runtime = docker_utils.get_available_runtime()
    images_state_file = os.path.join(args.work_dir, 'images-state.json')
    image_manager = DockerImageManager(
        images_state_file, args.max_image_cache_size, args.max_image_size,
    )

    worker_state_file = os.path.join(args.work_dir, 'worker-state.json')
    worker = Worker(
        image_manager,
        dependency_manager,
        worker_state_file,
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
    )

    # Register a signal handler to ensure safe shutdown.
    def _forward_to_worker(signum, frame):
        return worker.signal()

    for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
        signal.signal(sig, _forward_to_worker)

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()