def start(self):
    """Return whether we ran anything."""
    self.load_state()
    self.sync_state()
    self.image_manager.start()
    if not self.shared_file_system:
        self.dependency_manager.start()
    while not self.terminate:
        try:
            self.process_runs()
            self.save_state()
            self.checkin()
            self.check_termination()
            self.save_state()
            if self.check_idle_stop() or self.check_num_runs_stop():
                self.terminate = True
            else:
                time.sleep(self.checkin_frequency_seconds)
        except Exception:
            self.last_checkin_successful = False
            if using_sentry():
                capture_exception()
            traceback.print_exc()
            if self.exit_on_exception:
                logger.warning(
                    'Encountered exception, terminating the worker after sleeping for 5 minutes...'
                )
                self.terminate = True
                # Sleep for 5 minutes
                time.sleep(5 * 60)
            else:
                # Sleep for a long time so we don't keep on failing.
                # We sleep in 5-second increments to check
                # if the worker needs to terminate (say, if it's received
                # a SIGTERM signal).
                logger.warning('Sleeping for 1 hour due to exception...please help me!')
                for _ in range(12 * 60):
                    # We run this here, instead of going through another iteration of the
                    # while loop, to minimize the code that's run---the reason we ended up here
                    # in the first place is because of an exception, so we don't want to
                    # re-trigger that exception.
                    if self.terminate_and_restage:
                        # If self.terminate_and_restage is true, self.check_termination()
                        # restages bundles. We surround this in a try-except block,
                        # so we can still properly terminate and clean up
                        # even if self.check_termination() fails for some reason.
                        try:
                            self.check_termination()
                        except Exception:
                            traceback.print_exc()
                        self.terminate = True
                    if self.terminate:
                        break
                    time.sleep(5)
    self.cleanup()
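# --- Example (standalone sketch, not part of the Worker class): the shutdown pattern
# start() relies on. A signal handler flips a flag, and the loop polls that flag
# between short sleeps, so a SIGTERM is noticed within seconds even during long
# back-off periods. All names below are illustrative, not from the codebase.
import signal
import time


class StoppableLoop:
    def __init__(self):
        self.terminate = False

    def signal(self, *_):
        # Called from the signal handler; just flip the flag.
        self.terminate = True

    def run(self):
        while not self.terminate:
            # ... one unit of work (check in with the server, process runs, etc.) ...
            for _ in range(6):  # sleep up to 30 seconds, in 5-second slices
                if self.terminate:
                    break
                time.sleep(5)


# Usage: register the handler, then run the loop.
# loop = StoppableLoop()
# for sig in (signal.SIGTERM, signal.SIGINT):
#     signal.signal(sig, lambda signum, frame: loop.signal())
# loop.run()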
def image_availability_state(image_spec, success_message, failure_message):
    """
    Try to get the image specified by image_spec from host machine.
    Return ImageAvailabilityState.
    """
    try:
        image = self._docker.images.get(image_spec)
        digests = image.attrs.get('RepoDigests', [image_spec])
        digest = digests[0] if len(digests) > 0 else None
        new_timestamp = str(time.time())
        image.tag(self.CACHE_TAG, tag=new_timestamp)
        for tag in image.tags:
            tag_label, timestamp = tag.split(":")
            # remove any other timestamp but not the current one
            if tag_label == self.CACHE_TAG and timestamp != new_timestamp:
                try:
                    self._docker.images.remove(tag)
                except docker.errors.NotFound as err:
                    # It's possible that we get a 404 not found error here when removing the image,
                    # since another worker on the same system has already done so. We just
                    # ignore this 404, since any extraneous tags will be removed during the next iteration.
                    logger.warning(
                        "Attempted to remove image %s from cache, but image was not found: %s",
                        tag,
                        err,
                    )
        return ImageAvailabilityState(
            digest=digest, stage=DependencyStage.READY, message=success_message
        )
    except Exception as ex:
        if using_sentry():
            capture_exception()
        return ImageAvailabilityState(
            digest=None, stage=DependencyStage.FAILED, message=failure_message % ex
        )
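# --- Example (standalone sketch, not from the codebase): the cache bookkeeping above
# encodes each image's last-used time as an extra Docker tag. A minimal version of
# that pattern using the docker SDK; the repository name CACHE_REPO is a hypothetical
# stand-in for the image manager's CACHE_TAG constant.
import time

import docker

CACHE_REPO = 'image-cache/last-used'  # hypothetical cache repository name


def touch_image(client: docker.DockerClient, image_spec: str) -> None:
    """Tag the image with the current time and drop older cache tags."""
    image = client.images.get(image_spec)
    now = str(time.time())
    image.tag(CACHE_REPO, tag=now)
    for tag in image.tags:
        repo, _, timestamp = tag.rpartition(':')
        if repo == CACHE_REPO and timestamp != now:
            try:
                client.images.remove(tag)  # untag the stale cache entry
            except docker.errors.NotFound:
                pass  # another worker may have removed it already


# Usage (assumes the image is already present locally):
# touch_image(docker.from_env(), 'ubuntu:20.04')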
def main():
    args = parse_args()

    # Configure logging
    logging.basicConfig(
        format='%(asctime)s %(message)s',
        level=(logging.DEBUG if args.verbose else logging.INFO),
    )

    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
        )

    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    docker_runtime = docker_utils.get_available_runtime()
    image_manager = DockerImageManager(
        os.path.join(args.work_dir, 'images-state.json'),
        args.max_image_cache_size,
        args.max_image_size,
    )

    worker = Worker(
        image_manager,
        dependency_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
def start_worker_job(self):
    image = 'codalab/worker:' + os.environ.get('CODALAB_VERSION', 'latest')
    worker_id = uuid.uuid4().hex
    logger.debug('Starting worker %s with image %s', worker_id, image)
    work_dir_prefix = (
        self.args.worker_work_dir_prefix if self.args.worker_work_dir_prefix else "/tmp/"
    )

    # This needs to be a unique directory since Batch jobs may share a host
    work_dir = os.path.join(work_dir_prefix, 'cl_worker_{}_work_dir'.format(worker_id))
    command = self.build_command(worker_id, work_dir)

    # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-batch-jobdefinition.html
    # Need to mount:
    # - docker.sock to enable us to start docker in docker
    # - work_dir so that the run bundle's output is visible to the worker
    job_definition = {
        'jobDefinitionName': self.args.job_definition_name,
        'type': 'container',
        'parameters': {},
        'containerProperties': {
            'image': image,
            'vcpus': self.args.cpus,
            'memory': self.args.memory_mb,
            'command': [
                "/bin/bash",
                "-c",
                "/opt/scripts/detect-ec2-spot-preemption.sh & "
                + " ".join(quote(arg) for arg in command),
            ],
            'environment': [
                {'name': 'CODALAB_USERNAME', 'value': os.environ.get('CODALAB_USERNAME')},
                {'name': 'CODALAB_PASSWORD', 'value': os.environ.get('CODALAB_PASSWORD')},
            ],
            'volumes': [
                {'host': {'sourcePath': '/var/run/docker.sock'}, 'name': 'var_run_docker_sock'},
                {'host': {'sourcePath': work_dir}, 'name': 'work_dir'},
            ],
            'mountPoints': [
                {
                    'sourceVolume': 'var_run_docker_sock',
                    'containerPath': '/var/run/docker.sock',
                    'readOnly': False,
                },
                {'sourceVolume': 'work_dir', 'containerPath': work_dir, 'readOnly': False},
            ],
            'readonlyRootFilesystem': False,
            'user': self.args.user,
        },
        'retryStrategy': {'attempts': 1},
    }

    if self.args.gpus:
        job_definition["containerProperties"]["resourceRequirements"] = [
            {"value": str(self.args.gpus), "type": "GPU"}
        ]

    # Allow worker to directly mount a directory. Note that the worker
    # needs to be set up a priori with this shared filesystem.
    if os.environ.get('CODALAB_SHARED_FILE_SYSTEM') == 'true':
        command.append('--shared-file-system')
        bundle_mount = os.environ.get('CODALAB_BUNDLE_MOUNT')
        job_definition['containerProperties']['volumes'].append(
            {'host': {'sourcePath': bundle_mount}, 'name': 'shared_dir'}
        )
        job_definition['containerProperties']['mountPoints'].append(
            {'sourceVolume': 'shared_dir', 'containerPath': bundle_mount, 'readOnly': False}
        )

    if using_sentry():
        job_definition["containerProperties"]["environment"].append(
            {'name': 'CODALAB_SENTRY_INGEST_URL', 'value': CODALAB_SENTRY_INGEST}
        )
        job_definition["containerProperties"]["environment"].append(
            {'name': 'CODALAB_SENTRY_ENVIRONMENT', 'value': CODALAB_SENTRY_ENVIRONMENT}
        )

    # Create a job definition
    response = self.batch_client.register_job_definition(**job_definition)
    logger.info('register_job_definition: %s', response)

    # Submit the job
    response = self.batch_client.submit_job(
        jobName=self.args.job_definition_name,
        jobQueue=self.args.job_queue,
        jobDefinition=self.args.job_definition_name,
    )
    logger.info('submit_job: %s', response)
def start_worker_job(self) -> None:
    worker_image: str = 'codalab/worker:' + os.environ.get('CODALAB_VERSION', 'latest')
    worker_id: str = uuid.uuid4().hex
    logger.debug('Starting worker {} with image {}'.format(worker_id, worker_image))
    work_dir_prefix: str = (
        self.args.worker_work_dir_prefix if self.args.worker_work_dir_prefix else "/tmp/"
    )

    # This needs to be a unique directory since Batch jobs may share a host
    work_dir: str = os.path.join(work_dir_prefix, 'cl_worker_{}_work_dir'.format(worker_id))
    command: List[str] = self.build_command(worker_id, work_dir)

    task_container_run_options: List[str] = [
        '--cpus %d' % self.args.cpus,
        '--memory %dM' % self.args.memory_mb,
        '--volume /var/run/docker.sock:/var/run/docker.sock',
        '--volume %s:%s' % (work_dir, work_dir),
        '--user %s' % self.args.user,
    ]

    if os.environ.get('CODALAB_USERNAME') and os.environ.get('CODALAB_PASSWORD'):
        task_container_run_options.extend(
            [
                '--env CODALAB_USERNAME=%s' % os.environ.get('CODALAB_USERNAME'),
                '--env CODALAB_PASSWORD=%s' % os.environ.get('CODALAB_PASSWORD'),
            ]
        )
    else:
        raise EnvironmentError(
            'Valid credentials need to be set as environment variables: CODALAB_USERNAME and CODALAB_PASSWORD'
        )

    if os.environ.get('CODALAB_SHARED_FILE_SYSTEM') == 'true':
        # Allow workers to directly mount a directory
        command.append('--shared-file-system')
        task_container_run_options.append(
            '--volume shared_dir:%s' % os.environ.get('CODALAB_BUNDLE_MOUNT')
        )

    # Configure Sentry
    if using_sentry():
        task_container_run_options.append(
            '--env CODALAB_SENTRY_INGEST_URL=%s' % CODALAB_SENTRY_INGEST
        )
        task_container_run_options.append(
            '--env CODALAB_SENTRY_ENVIRONMENT=%s' % CODALAB_SENTRY_ENVIRONMENT
        )

    command_line: str = "/bin/bash -c '{}'".format(' '.join(command))
    logger.debug("Running the following as an Azure Batch task: {}".format(command_line))

    task_id: str = 'cl_worker_{}'.format(worker_id)
    task: TaskAddParameter = TaskAddParameter(
        id=task_id,
        command_line=command_line,
        container_settings=TaskContainerSettings(
            image_name=worker_image, container_run_options=' '.join(task_container_run_options)
        ),
        output_files=[
            OutputFile(
                file_pattern='../stderr.txt',
                destination=OutputFileDestination(
                    container=OutputFileBlobContainerDestination(
                        path=task_id, container_url=self.args.log_container_url
                    )
                ),
                upload_options=OutputFileUploadOptions(
                    # Upload worker logs once the task completes
                    upload_condition=OutputFileUploadCondition.task_completion
                ),
            )
        ],
    )

    try:
        # Create a task under the Azure Batch job.
        # Catch request errors to keep the worker manager running.
        self._batch_client.task.add(self.args.job_id, task)
    except (ClientRequestError, BatchErrorException) as e:
        logger.error(
            'Batch request to add task {} to job {} failed: {}'.format(
                task_id, self.args.job_id, str(e)
            )
        )
def main():
    args = parse_args()

    if args.tag and not args.tag.replace("-", "").isalnum():
        raise argparse.ArgumentTypeError(
            "Worker tag must only contain letters, numbers or hyphens."
        )

    # Configure logging
    log_format: str = '%(asctime)s %(message)s'
    if args.verbose:
        log_format += ' %(pathname)s %(lineno)d'
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logging.getLogger('urllib3').setLevel(logging.INFO)

    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
            args.download_dependencies_max_retries,
        )

    if args.container_runtime == "singularity":
        singularity_folder = os.path.join(args.work_dir, 'codalab_singularity_images')
        if not os.path.exists(singularity_folder):
            logger.info(
                'Local singularity image location %s doesn\'t exist, creating.',
                singularity_folder,
            )
            os.makedirs(singularity_folder, 0o770)
        image_manager = SingularityImageManager(
            args.max_image_size,
            args.max_image_cache_size,
            singularity_folder,
        )
        # todo workers with singularity don't work because this is set to none -- handle this
        docker_runtime = None
    else:
        image_manager = DockerImageManager(
            os.path.join(args.work_dir, 'images-state.json'),
            args.max_image_cache_size,
            args.max_image_size,
        )
        docker_runtime = docker_utils.get_available_runtime()

    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    worker = Worker(
        image_manager,
        dependency_manager,
        # Include the worker ID in the worker state JSON path, so multiple workers
        # sharing the same work directory maintain their own state.
        os.path.join(args.work_dir, f'worker-state-{args.id}.json'),
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
        shared_memory_size_gb=args.shared_memory_size_gb,
        preemptible=args.preemptible,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()