def task(work_dir, state_path, random_file_path):
    """
    Runs the end-to-end workflow of the Dependency Manager.
    Note: ProcessPoolExecutor must serialize everything before sending it to the worker,
    so this function needs to be defined at the top-level.
    """
    # Mock Bundle Service to return a random file object
    mock_bundle_service = MagicMock()
    mock_bundle_service.get_bundle_info = MagicMock(return_value={'type': "file"})
    file_obj = open(random_file_path, "rb")
    mock_bundle_service.get_bundle_contents = MagicMock(return_value=file_obj)

    # Create and start a dependency manager
    process_id = os.getpid()
    print(f"{process_id}: Starting a DependencyManager...")
    dependency_manager = DependencyManager(
        commit_file=state_path,
        bundle_service=mock_bundle_service,
        worker_dir=work_dir,
        max_cache_size_bytes=2048,
        download_dependencies_max_retries=1,
    )
    dependency_manager.start()
    print(f"{process_id}: Started with work directory: {work_dir}.")

    # Register a run's UUID as a dependent of a parent bundle with UUID 0x1
    dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
    run_uuid = f"0x{process_id}"
    state = dependency_manager.get(run_uuid, dependency_key)
    assert (
        run_uuid in state.dependents
    ), f"{process_id}: Expected {run_uuid} as one of the dependents."

    # Release the run bundle as a dependent
    dependency_manager.release(run_uuid, dependency_key)
    dependencies = dependency_manager._fetch_dependencies()
    if dependency_key in dependencies:
        state = dependencies[dependency_key]
        print(f"{process_id}: Checking {run_uuid} in {state.dependents}")
        assert (
            run_uuid not in state.dependents
        ), f"{process_id}: Dependent should not be in the list of dependents after unregistering."

    # Keep the dependency manager running for some time to test the loop
    time.sleep(30)

    # Stop the Dependency Manager
    print(f"{process_id}: Stopping DependencyManager...")
    dependency_manager.stop()
    print(f"{process_id}: Done.")
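# Hedged aside (not part of the worker source): a minimal sketch illustrating the
# constraint noted in the docstring above -- ProcessPoolExecutor pickles the callable
# before handing it to a worker process, so only top-level functions can be submitted.
# The names `double` and `demo_pickling_constraint` are illustrative only.
from concurrent.futures import ProcessPoolExecutor


def double(x):
    # Top-level function: picklable, safe to submit to a process pool.
    return x * 2


def demo_pickling_constraint():
    with ProcessPoolExecutor(max_workers=1) as executor:
        assert executor.submit(double, 21).result() == 42

        local_double = lambda x: x * 2  # defined locally: cannot be pickled
        try:
            executor.submit(local_double, 21).result()
        except Exception as exc:
            # Typically a PicklingError: "Can't pickle <function ... <lambda>>"
            print(f"Submitting a non-top-level callable fails: {exc}")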
def main():
    args = parse_args()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Configure logging
    logging.basicConfig(
        format='%(asctime)s %(message)s',
        level=(logging.DEBUG if args.verbose else logging.INFO),
    )

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
        )

    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    docker_runtime = docker_utils.get_available_runtime()
    image_manager = DockerImageManager(
        os.path.join(args.work_dir, 'images-state.json'),
        args.max_image_cache_size,
        args.max_image_size,
    )

    worker = Worker(
        image_manager,
        dependency_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.cpuset,
        args.gpuset,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.idle_seconds,
        bundle_service,
        args.shared_file_system,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END
    worker.start()
def main():
    args = parse_args()

    if args.tag and not args.tag.replace("-", "").isalnum():
        raise argparse.ArgumentTypeError(
            "Worker tag must only contain letters, numbers or hyphens."
        )

    # Configure logging
    log_format: str = '%(asctime)s %(message)s'
    if args.verbose:
        log_format += ' %(pathname)s %(lineno)d'
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logging.getLogger('urllib3').setLevel(logging.INFO)

    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
            args.download_dependencies_max_retries,
        )

    if args.container_runtime == "singularity":
        singularity_folder = os.path.join(args.work_dir, 'codalab_singularity_images')
        if not os.path.exists(singularity_folder):
            logger.info(
                'Local singularity image location %s doesn\'t exist, creating.',
                singularity_folder,
            )
            os.makedirs(singularity_folder, 0o770)
        image_manager = SingularityImageManager(
            args.max_image_size,
            args.max_image_cache_size,
            singularity_folder,
        )
        # todo workers with singularity don't work because this is set to none -- handle this
        docker_runtime = None
    else:
        image_manager = DockerImageManager(
            os.path.join(args.work_dir, 'images-state.json'),
            args.max_image_cache_size,
            args.max_image_size,
        )
        docker_runtime = docker_utils.get_available_runtime()

    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    worker = Worker(
        image_manager,
        dependency_manager,
        # Include the worker ID in the worker state JSON path, so multiple workers
        # sharing the same work directory maintain their own state.
        os.path.join(args.work_dir, f'worker-state-{args.id}.json'),
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
        shared_memory_size_gb=args.shared_memory_size_gb,
        preemptible=args.preemptible,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END
    worker.start()
class DependencyManagerTest(unittest.TestCase):
    def setUp(self):
        if module_failed:
            self.skipTest('Issue with ratarmountcore.')

        self.work_dir = tempfile.mkdtemp()
        self.state_path = os.path.join(self.work_dir, "dependencies-state.json")
        self.dependency_manager = DependencyManager(
            commit_file=self.state_path,
            bundle_service=None,
            worker_dir=self.work_dir,
            max_cache_size_bytes=1024,
            download_dependencies_max_retries=1,
        )

    def tearDown(self):
        shutil.rmtree(self.work_dir)

    def test_get_has(self):
        dependent_uuid = "0x2"
        dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
        state = self.dependency_manager.get(dependent_uuid, dependency_key)
        self.assertTrue(self.dependency_manager.has(dependency_key))
        self.assertEqual(state.stage, "DOWNLOADING")
        self.assertEqual(state.path, "0x1_parent")
        self.assertEqual(state.dependents, {dependent_uuid})

    def test_release(self):
        dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
        self.dependency_manager.get("0x2", dependency_key)
        state = self.dependency_manager.get("0x3", dependency_key)
        # Passing in the same dependency key with a different dependent will just add the dependent
        self.assertEqual(state.dependents, {"0x2", "0x3"})

        # Release 0x2 as a dependent
        self.dependency_manager.release("0x2", dependency_key)
        with self.dependency_manager._state_lock:
            dependencies = self.dependency_manager._fetch_dependencies()
            state = dependencies[dependency_key]
            self.assertEqual(state.dependents, {"0x3"})

        # Release 0x3 as a dependent - should be left with no dependents
        self.dependency_manager.release("0x3", dependency_key)
        with self.dependency_manager._state_lock:
            dependencies = self.dependency_manager._fetch_dependencies()
            state = dependencies[dependency_key]
            self.assertEqual(len(state.dependents), 0)

    def test_all_dependencies(self):
        dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
        self.dependency_manager.get("0x2", dependency_key)
        dependency_key = DependencyKey(parent_uuid="0x3", parent_path="parent2")
        self.dependency_manager.get("0x4", dependency_key)
        dependency_keys = self.dependency_manager.all_dependencies
        self.assertEqual(len(dependency_keys), 2)

    @unittest.skip(
        "Flufl.lock doesn't seem to work on GHA for some reason, "
        "even though this test passes on other machines."
    )
    def test_concurrency(self):
        num_of_dependency_managers = 10
        executor = ProcessPoolExecutor(max_workers=num_of_dependency_managers)

        random_file_path = os.path.join(self.work_dir, "random_file")
        with open(random_file_path, "wb") as f:
            f.seek((1024 * 1024 * 1024) - 1)  # 1 GB
            f.write(b"\0")

        futures = [
            executor.submit(task, self.work_dir, self.state_path, random_file_path)
            for _ in range(num_of_dependency_managers)
        ]
        for future in futures:
            print(future.result())
            self.assertIsNone(future.exception())
        executor.shutdown()
def main():
    args = parse_args()

    if args.tag and not args.tag.isalnum():
        raise argparse.ArgumentTypeError(
            "Worker tag must be alphanumeric (only contain letters and numbers)."
        )

    # Configure logging
    logging.basicConfig(
        format='%(asctime)s %(message)s %(pathname)s %(lineno)d',
        level=(logging.DEBUG if args.verbose else logging.INFO),
    )
    logging.getLogger('urllib3').setLevel(logging.INFO)

    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
        )

    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    docker_runtime = docker_utils.get_available_runtime()
    image_manager = DockerImageManager(
        os.path.join(args.work_dir, 'images-state.json'),
        args.max_image_cache_size,
        args.max_image_size,
    )

    worker = Worker(
        image_manager,
        dependency_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END
    worker.start()