print >> sys.stderr, """ Permissions on password file are too lax. Only the user should be allowed to access the file. On Linux, run: chmod 600 %s""" % args.password_file exit(1) with open(args.password_file) as f: username = f.readline().strip() password = f.readline().strip() else: username = raw_input('Username: '******'%(asctime)s %(message)s', level=logging.DEBUG) max_work_dir_size_bytes = parse_size(args.max_work_dir_size) worker = Worker(args.id, args.tag, args.work_dir, max_work_dir_size_bytes, args.shared_file_system, args.slots, BundleServiceClient(args.server, username, password), DockerClient()) # Register a signal handler to ensure safe shutdown. for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]: signal.signal(sig, lambda signup, frame: worker.signal()) print 'Worker started.' worker.run()
def main():
    """
    Entry point of the CodaLab worker command-line program.

    Parses the command-line options, obtains the bundle-service credentials,
    logs into the bundle service, builds a Worker backed by a local
    filesystem / Docker run manager and starts it.

    Credentials come from, in order of precedence:
      1. --password-file (username on the first line, password on the
         second); the process exits with status 1 if the file's mode grants
         any permission to group or others,
      2. the CODALAB_USERNAME / CODALAB_PASSWORD environment variables,
      3. interactive prompts.

    Returns None. Returns early (after logging an error) when the bundle
    service raises BundleAuthException.
    """
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument(
        '--tag', help='Tag that allows for scheduling runs on specific ' 'workers.'
    )
    parser.add_argument(
        '--server',
        default='https://worksheets.codalab.org',
        help='URL of the CodaLab server, in the format '
        '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)',
    )
    parser.add_argument(
        '--work-dir',
        default='codalab-worker-scratch',
        help='Directory where to store temporary bundle data, '
        'including dependencies and the data from run '
        'bundles.',
    )
    parser.add_argument(
        '--network-prefix', default='codalab_worker_network', help='Docker network name prefix'
    )
    parser.add_argument(
        '--cpuset',
        type=str,
        metavar='CPUSET_STR',
        default='ALL',
        help='Comma-separated list of CPUs in which to allow bundle execution, '
        '(e.g., "0,2,3", "1").',
    )
    parser.add_argument(
        '--gpuset',
        type=str,
        metavar='GPUSET_STR',
        default='ALL',
        help='Comma-separated list of GPUs in which to allow bundle execution. '
        'Each GPU can be specified by its index or UUID '
        '(e.g., "0,1", "1", "GPU-62casdfasd-asfas...").',
    )
    parser.add_argument(
        '--max-work-dir-size',
        type=str,
        metavar='SIZE',
        default='10g',
        help='Maximum size of the temporary bundle data ' '(e.g., 3, 3k, 3m, 3g, 3t).',
    )
    parser.add_argument(
        '--max-image-cache-size',
        type=str,
        metavar='SIZE',
        help='Limit the disk space used to cache Docker images '
        'for worker jobs to the specified amount (e.g. '
        '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
        'the least recently used images are removed first. '
        'Worker will not remove any images if this option '
        'is not specified.',
    )
    parser.add_argument(
        '--password-file',
        help='Path to the file containing the username and '
        'password for logging into the bundle service, '
        'each on a separate line. If not specified, the '
        'password is read from standard input.',
    )
    parser.add_argument(
        '--verbose', action='store_true', help='Whether to output verbose log messages.'
    )
    parser.add_argument(
        '--exit-when-idle',
        action='store_true',
        help='If specified the worker quits if it finds itself with no jobs after a checkin',
    )
    parser.add_argument(
        '--id',
        default='%s(%d)' % (socket.gethostname(), os.getpid()),
        help='Internal use: ID to use for the worker.',
    )
    parser.add_argument(
        '--shared-file-system',
        action='store_true',
        help='Internal use: Whether the file system containing '
        'bundle data is shared between the bundle service '
        'and the worker.',
    )
    args = parser.parse_args()

    # Set up logging first, so that the messages emitted while connecting are
    # not dropped for lack of a configured handler.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format='%(asctime)s %(message)s', level=log_level)

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        # Refuse a password file that group or others have any access to.
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            sys.stderr.write(
                ' Permissions on password file are too lax.'
                ' Only the user should be allowed to access the file.'
                ' On Linux, run: chmod 600 %s\n' % args.password_file
            )
            sys.exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: ')
        password = os.environ.get('CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    try:
        bundle_service = BundleServiceClient(args.server, username, password)
    except BundleAuthException as ex:
        logger.error('Cannot log into the bundle service. Please check your worker credentials.\n')
        logger.debug('Auth error: {}'.format(ex))
        return

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)

    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)

    def create_local_run_manager(worker):
        """
        To avoid circular dependencies the Worker initializer takes a
        RunManager factory to initialize its run manager. This factory
        creates a LocalFilesystem-Docker RunManager, which is the default
        execution architecture CodaLab uses.
        """
        docker_runtime = docker_utils.get_available_runtime()
        cpuset = parse_cpuset_args(args.cpuset)
        gpuset = parse_gpuset_args(args.gpuset)
        dependency_manager = LocalFileSystemDependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            max_work_dir_size_bytes,
        )
        image_manager = DockerImageManager(
            os.path.join(args.work_dir, 'images-state.json'), max_images_bytes
        )
        return LocalRunManager(
            worker,
            image_manager,
            dependency_manager,
            os.path.join(args.work_dir, 'run-state.json'),
            cpuset,
            gpuset,
            args.work_dir,
            docker_runtime=docker_runtime,
            docker_network_prefix=args.network_prefix,
        )

    worker = Worker(
        create_local_run_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.id,
        args.tag,
        args.work_dir,
        args.exit_when_idle,
        bundle_service,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signum, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    print('Worker started.')
    # END

    worker.start()
def main():
    """
    Entry point of the CodaLab worker command-line program.

    Parses the command-line options, obtains the bundle-service credentials,
    builds a Worker that talks to the bundle service and to Docker, and runs
    it.

    Credentials come from, in order of precedence:
      1. --password-file (username on the first line, password on the
         second); the process exits with status 1 if the file's mode grants
         any permission to group or others,
      2. the CODALAB_USERNAME / CODALAB_PASSWORD environment variables,
      3. interactive prompts.

    Returns None.
    """
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument(
        '--tag', help='Tag that allows for scheduling runs on specific ' 'workers.'
    )
    parser.add_argument(
        '--server',
        default='https://worksheets.codalab.org',
        help='URL of the CodaLab server, in the format '
        '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)',
    )
    parser.add_argument(
        '--work-dir',
        default='codalab-worker-scratch',
        help='Directory where to store temporary bundle data, '
        'including dependencies and the data from run '
        'bundles.',
    )
    parser.add_argument(
        '--max-work-dir-size',
        type=str,
        metavar='SIZE',
        default='10g',
        help='Maximum size of the temporary bundle data ' '(e.g., 3, 3k, 3m, 3g, 3t).',
    )
    parser.add_argument(
        '--max-image-cache-size',
        type=str,
        metavar='SIZE',
        help='Limit the disk space used to cache Docker images '
        'for worker jobs to the specified amount (e.g. '
        '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
        'the least recently used images are removed first. '
        'Worker will not remove any images if this option '
        'is not specified.',
    )
    parser.add_argument(
        '--slots',
        type=int,
        default=1,
        help='Number of slots to use for running bundles. '
        'A single bundle takes up a single slot.',
    )
    parser.add_argument(
        '--password-file',
        help='Path to the file containing the username and '
        'password for logging into the bundle service, '
        'each on a separate line. If not specified, the '
        'password is read from standard input.',
    )
    parser.add_argument(
        '--verbose', action='store_true', help='Whether to output verbose log messages.'
    )
    parser.add_argument(
        '--id',
        default='%s(%d)' % (socket.gethostname(), os.getpid()),
        help='Internal use: ID to use for the worker.',
    )
    parser.add_argument(
        '--shared-file-system',
        action='store_true',
        help='Internal use: Whether the file system containing '
        'bundle data is shared between the bundle service '
        'and the worker.',
    )
    args = parser.parse_args()

    # Set up logging first, so that the messages emitted while connecting are
    # not dropped for lack of a configured handler.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format='%(asctime)s %(message)s', level=log_level)

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        # Refuse a password file that group or others have any access to.
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            sys.stderr.write(
                ' Permissions on password file are too lax.'
                ' Only the user should be allowed to access the file.'
                ' On Linux, run: chmod 600 %s\n' % args.password_file
            )
            sys.exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: ')
        password = os.environ.get('CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)

    worker = Worker(
        args.id,
        args.tag,
        args.work_dir,
        max_work_dir_size_bytes,
        max_images_bytes,
        args.shared_file_system,
        args.slots,
        BundleServiceClient(args.server, username, password),
        DockerClient(),
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signum, frame: worker.signal())

    logger.info('Worker started.')
    worker.run()
def main():
    """
    Entry point of the CodaLab worker command-line program.

    Parses the command-line options, obtains the bundle-service credentials,
    logs into the bundle service, builds a Worker backed by a local
    filesystem / Docker run manager and starts it.

    Credentials come from, in order of precedence:
      1. --password-file (username on the first line, password on the
         second); the process exits with status 1 if the file's mode grants
         any permission to group or others,
      2. the CODALAB_USERNAME / CODALAB_PASSWORD environment variables,
      3. interactive prompts.

    Returns None. Returns early (after logging an error) when the bundle
    service raises BundleAuthException.
    """
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument(
        '--tag', help='Tag that allows for scheduling runs on specific ' 'workers.'
    )
    parser.add_argument(
        '--server',
        default='https://worksheets.codalab.org',
        help='URL of the CodaLab server, in the format '
        '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)',
    )
    parser.add_argument(
        '--work-dir',
        default='codalab-worker-scratch',
        help='Directory where to store temporary bundle data, '
        'including dependencies and the data from run '
        'bundles.',
    )
    parser.add_argument(
        '--network-prefix', default='codalab_worker_network', help='Docker network name prefix'
    )
    parser.add_argument(
        '--cpuset',
        type=str,
        metavar='CPUSET_STR',
        default='ALL',
        help='Comma-separated list of CPUs in which to allow bundle execution, '
        '(e.g., "0,2,3", "1").',
    )
    parser.add_argument(
        '--gpuset',
        type=str,
        metavar='GPUSET_STR',
        default='ALL',
        help='Comma-separated list of GPUs in which to allow bundle execution. '
        'Each GPU can be specified by its index or UUID '
        '(e.g., "0,1", "1", "GPU-62casdfasd-asfas...").',
    )
    parser.add_argument(
        '--max-work-dir-size',
        type=str,
        metavar='SIZE',
        default='10g',
        help='Maximum size of the temporary bundle data ' '(e.g., 3, 3k, 3m, 3g, 3t).',
    )
    parser.add_argument(
        '--max-image-cache-size',
        type=str,
        metavar='SIZE',
        help='Limit the disk space used to cache Docker images '
        'for worker jobs to the specified amount (e.g. '
        '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
        'the least recently used images are removed first. '
        'Worker will not remove any images if this option '
        'is not specified.',
    )
    parser.add_argument(
        '--password-file',
        help='Path to the file containing the username and '
        'password for logging into the bundle service, '
        'each on a separate line. If not specified, the '
        'password is read from standard input.',
    )
    parser.add_argument(
        '--verbose', action='store_true', help='Whether to output verbose log messages.'
    )
    parser.add_argument(
        '--exit-when-idle',
        action='store_true',
        help='If specified the worker quits if it finds itself with no jobs after a checkin',
    )
    parser.add_argument(
        '--id',
        default='%s(%d)' % (socket.gethostname(), os.getpid()),
        help='Internal use: ID to use for the worker.',
    )
    parser.add_argument(
        '--shared-file-system',
        action='store_true',
        help='Internal use: Whether the file system containing '
        'bundle data is shared between the bundle service '
        'and the worker.',
    )
    args = parser.parse_args()

    # Set up logging first, so that the messages emitted while connecting are
    # not dropped for lack of a configured handler.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format='%(asctime)s %(message)s', level=log_level)

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        # Refuse a password file that group or others have any access to.
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            sys.stderr.write(
                ' Permissions on password file are too lax.'
                ' Only the user should be allowed to access the file.'
                ' On Linux, run: chmod 600 %s\n' % args.password_file
            )
            sys.exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: ')
        password = os.environ.get('CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    try:
        bundle_service = BundleServiceClient(args.server, username, password)
    except BundleAuthException as ex:
        logger.error('Cannot log into the bundle service. Please check your worker credentials.\n')
        logger.debug('Auth error: {}'.format(ex))
        return

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)

    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)

    def create_local_run_manager(worker):
        """
        To avoid circular dependencies the Worker initializer takes a
        RunManager factory to initialize its run manager. This factory
        creates a LocalFilesystem-Docker RunManager, which is the default
        execution architecture CodaLab uses.
        """
        docker_runtime = docker_utils.get_available_runtime()
        cpuset = parse_cpuset_args(args.cpuset)
        gpuset = parse_gpuset_args(args.gpuset)
        dependency_manager = LocalFileSystemDependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            max_work_dir_size_bytes,
        )
        image_manager = DockerImageManager(
            os.path.join(args.work_dir, 'images-state.json'), max_images_bytes
        )
        return LocalRunManager(
            worker,
            image_manager,
            dependency_manager,
            os.path.join(args.work_dir, 'run-state.json'),
            cpuset,
            gpuset,
            args.work_dir,
            docker_runtime=docker_runtime,
            docker_network_prefix=args.network_prefix,
        )

    worker = Worker(
        create_local_run_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.id,
        args.tag,
        args.work_dir,
        args.exit_when_idle,
        bundle_service,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signum, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
def start_bundle_container(
    bundle_path,
    uuid,
    dependencies,
    command,
    docker_image,
    network=None,
    cpuset=None,
    gpuset=None,
    memory_bytes=0,
    detach=True,
    tty=False,
    runtime=DEFAULT_RUNTIME,
):
    """
    Run `command` for a bundle inside a new Docker container and return the
    container object produced by `client.containers.run`.

    The command is wrapped in `bash -c` with its stdout/stderr redirected to
    files named `stdout` and `stderr` in the working directory, which is
    `/<uuid>` inside the container. The container runs as the uid/gid that
    owns `bundle_path`.

    :param bundle_path: host path of the bundle directory; it is stat'ed to
        determine the uid/gid the container runs as
    :param uuid: bundle UUID; `/<uuid>` is the working directory and HOME
    :param dependencies: passed to get_bundle_container_volume_binds
    :param command: shell command to run (a trailing ';' is added if missing)
    :param docker_image: image to run
    :param network: Docker network passed through to the container
    :param cpuset: iterable of CPU id strings, joined with ',' (empty/None
        yields an empty cpuset string)
    :param gpuset: iterable of GPU id strings; only used for NVIDIA_RUNTIME,
        where empty/None means 'all'
    :param memory_bytes: hard memory limit; must be at least 4m
    :param detach, tty, runtime: passed through to `client.containers.run`
        (`tty` also sets `stdin_open`)
    :raises DockerException: if `memory_bytes` is below the 4m minimum
    """
    # Impose a minimum container request memory 4mb, same as docker's minimum allowed value
    # https://docs.docker.com/config/containers/resource_constraints/#limit-a-containers-access-to-memory
    # When using the REST api, it is allowed to set Memory to 0 but that means the container has unbounded
    # access to the host machine's memory, which we have decided to not allow
    min_memory_bytes = parse_size('4m')
    if memory_bytes < min_memory_bytes:
        raise DockerException('Minimum memory must be 4m ({} bytes)'.format(min_memory_bytes))

    if not command.endswith(';'):
        command = '{};'.format(command)
    docker_command = ['bash', '-c', '( %s ) >stdout 2>stderr' % command]
    docker_bundle_path = '/' + uuid
    volumes = get_bundle_container_volume_binds(bundle_path, docker_bundle_path, dependencies)
    environment = {'HOME': docker_bundle_path}
    working_dir = docker_bundle_path
    # Unset entrypoint regardless of image
    entrypoint = ''
    cpuset_str = ','.join(cpuset) if cpuset else ''

    # Get user/group that owns the bundle directory
    # Then we can ensure that any created files are owned by the user/group
    # that owns the bundle directory, not root.
    bundle_stat = os.stat(bundle_path)
    uid = bundle_stat.st_uid
    gid = bundle_stat.st_gid
    # TODO: Fix potential permissions issues arising from this setting
    # This can cause problems if users expect to run as a specific user
    # Docker expects the user as "<uid>:<gid>".
    user = '%s:%s' % (uid, gid)

    if runtime == NVIDIA_RUNTIME:
        # nvidia-docker runtime uses this env variable to allocate GPUs
        environment['NVIDIA_VISIBLE_DEVICES'] = ','.join(gpuset) if gpuset else 'all'

    container = client.containers.run(
        image=docker_image,
        command=docker_command,
        network=network,
        mem_limit=memory_bytes,
        cpuset_cpus=cpuset_str,
        environment=environment,
        working_dir=working_dir,
        entrypoint=entrypoint,
        volumes=volumes,
        user=user,
        detach=detach,
        runtime=runtime,
        tty=tty,
        stdin_open=tty,
    )
    logger.debug('Started Docker container for UUID %s, container ID %s,', uuid, container.id)
    return container
def main():
    """
    Entry point of the CodaLab worker command-line program.

    Parses the command-line options, obtains the bundle-service credentials,
    builds a Worker and runs it. Runs are submitted to the local Docker
    daemon unless --batch-queue is given, in which case they are submitted
    to that AWS Batch queue (this requires boto3; the process exits with
    status 1 if it cannot be imported).

    Credentials come from, in order of precedence:
      1. --password-file (username on the first line, password on the
         second); the process exits with status 1 if the file's mode grants
         any permission to group or others,
      2. the CODALAB_USERNAME / CODALAB_PASSWORD environment variables,
      3. interactive prompts.

    Returns None.
    """
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument(
        '--tag', help='Tag that allows for scheduling runs on specific ' 'workers.'
    )
    parser.add_argument(
        '--server',
        default='https://worksheets.codalab.org',
        help='URL of the CodaLab server, in the format '
        '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)',
    )
    parser.add_argument(
        '--work-dir',
        default='codalab-worker-scratch',
        help='Directory where to store temporary bundle data, '
        'including dependencies and the data from run '
        'bundles.',
    )
    parser.add_argument(
        '--network-prefix', default='codalab_worker_network', help='Docker network name prefix'
    )
    parser.add_argument(
        '--cpuset',
        type=str,
        metavar='CPUSET_STR',
        default='ALL',
        help='Comma-separated list of CPUs in which to allow bundle execution, '
        '(e.g., "0,2,3", "1").',
    )
    parser.add_argument(
        '--gpuset',
        type=str,
        metavar='GPUSET_STR',
        default='ALL',
        help='Comma-separated list of GPUs in which to allow bundle execution '
        '(e.g., "0,1", "1").',
    )
    parser.add_argument(
        '--max-work-dir-size',
        type=str,
        metavar='SIZE',
        default='10g',
        help='Maximum size of the temporary bundle data ' '(e.g., 3, 3k, 3m, 3g, 3t).',
    )
    parser.add_argument(
        '--max-dependencies-serialized-length',
        type=int,
        default=60000,
        help='Maximum length of serialized json of dependency list of worker '
        '(e.g., 50, 30000, 60000).',
    )
    parser.add_argument(
        '--max-image-cache-size',
        type=str,
        metavar='SIZE',
        help='Limit the disk space used to cache Docker images '
        'for worker jobs to the specified amount (e.g. '
        '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
        'the least recently used images are removed first. '
        'Worker will not remove any images if this option '
        'is not specified.',
    )
    parser.add_argument(
        '--password-file',
        help='Path to the file containing the username and '
        'password for logging into the bundle service, '
        'each on a separate line. If not specified, the '
        'password is read from standard input.',
    )
    parser.add_argument(
        '--verbose', action='store_true', help='Whether to output verbose log messages.'
    )
    parser.add_argument(
        '--id',
        default='%s(%d)' % (socket.gethostname(), os.getpid()),
        help='Internal use: ID to use for the worker.',
    )
    parser.add_argument(
        '--shared-file-system',
        action='store_true',
        help='Internal use: Whether the file system containing '
        'bundle data is shared between the bundle service '
        'and the worker.',
    )
    parser.add_argument(
        '--batch-queue',
        help='Name of the AWS Batch queue to use for run submission. '
        'Providing this option will cause runs to be submitted to Batch rather than local docker. '
        'The queue must already exist and you must have AWS credentials to submit to it.',
    )
    args = parser.parse_args()

    # Set up logging first, so that the messages emitted while connecting are
    # not dropped for lack of a configured handler.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format='%(asctime)s %(message)s', level=log_level)

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        # Refuse a password file that group or others have any access to.
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            sys.stderr.write(
                ' Permissions on password file are too lax.'
                ' Only the user should be allowed to access the file.'
                ' On Linux, run: chmod 600 %s\n' % args.password_file
            )
            sys.exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: ')
        password = os.environ.get('CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    max_dependencies_serialized_length = args.max_dependencies_serialized_length
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)

    bundle_service = BundleServiceClient(args.server, username, password)

    # TODO Break the dependency of RunManagers on Worker to make this initialization nicer
    def create_run_manager(w):
        """Build the run manager for worker `w`: local Docker, or AWS Batch if --batch-queue is set."""
        if args.batch_queue is None:
            # We defer importing the run managers so their dependencies are lazily loaded
            from docker_run import DockerRunManager
            from docker_client import DockerClient
            from docker_image_manager import DockerImageManager

            logging.info("Using local docker client for run submission.")

            docker = DockerClient()
            image_manager = DockerImageManager(docker, args.work_dir, max_images_bytes)
            cpuset = parse_cpuset_args(args.cpuset)
            gpuset = parse_gpuset_args(docker, args.gpuset)
            return DockerRunManager(
                docker, bundle_service, image_manager, w, args.network_prefix, cpuset, gpuset
            )
        else:
            try:
                import boto3
            except ImportError:
                logging.exception(
                    "Missing dependencies, please install boto3 to enable AWS support."
                )
                import sys

                sys.exit(1)

            from aws_batch import AwsBatchRunManager

            logging.info("Using AWS Batch queue %s for run submission.", args.batch_queue)

            batch_client = boto3.client('batch')
            return AwsBatchRunManager(batch_client, args.batch_queue, bundle_service, w)

    worker = Worker(
        args.id,
        args.tag,
        args.work_dir,
        max_work_dir_size_bytes,
        max_dependencies_serialized_length,
        args.shared_file_system,
        bundle_service,
        create_run_manager,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signum, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    print('Worker started.')
    # END

    worker.run()
def start_container(self, bundle_path, uuid, command, docker_image, network_name,
                    dependencies, cpuset, gpuset, memory_bytes=0):
    """
    Create and start a Docker container for a bundle through the Docker REST
    API and return the new container's ID.

    The container's working directory and HOME are `/<uuid>`, and it runs as
    the uid/gid that owns `bundle_path`.

    :param bundle_path: host path of the bundle directory; it is stat'ed to
        determine the uid/gid the container runs as
    :param uuid: bundle UUID
    :param command: command of the bundle (passed to the command/volume helpers)
    :param docker_image: image to create the container from
    :param network_name: value for the container's NetworkMode
    :param dependencies: passed to the command/volume helpers
    :param cpuset: iterable of CPU ids, joined with ',' into CpusetCpus
    :param gpuset: iterable of GPU ids; only used when nvidia-docker is enabled
    :param memory_bytes: hard memory limit; must be at least 4m
    :raises DockerException: if `memory_bytes` is below the 4m minimum, or if
        the create / start request does not return status 201 / 204
    """
    # Impose a minimum container request memory 4mb, same as docker's minimum allowed value
    # https://docs.docker.com/config/containers/resource_constraints/#limit-a-containers-access-to-memory
    # When using the REST api, it is allowed to set Memory to 0 but that means the container has unbounded
    # access to the host machine's memory, which we have decided to not allow
    min_memory_bytes = parse_size('4m')
    if memory_bytes < min_memory_bytes:
        raise DockerException(
            'Minimum memory must be 4m ({} bytes)'.format(min_memory_bytes))

    docker_commands = self._get_docker_commands(
        bundle_path, uuid, command, docker_image, dependencies)
    volume_bindings = self._get_volume_bindings(
        bundle_path, uuid, command, docker_image, dependencies)

    # Get user/group that owns the bundle directory
    # Then we can ensure that any created files are owned by the user/group
    # that owns the bundle directory, not root.
    bundle_stat = os.stat(bundle_path)
    uid = bundle_stat.st_uid
    gid = bundle_stat.st_gid

    docker_bundle_path = '/' + uuid

    # Create the container.
    create_request = {
        'Cmd': ['bash', '-c', '; '.join(docker_commands)],
        'Image': docker_image,
        'WorkingDir': docker_bundle_path,
        'Env': ['HOME=%s' % docker_bundle_path],
        'Entrypoint': [''],  # unset entry point regardless of image
        'HostConfig': {
            'Binds': volume_bindings,
            'NetworkMode': network_name,
            'Memory': memory_bytes,  # hard memory limit
            'CpusetCpus': ','.join(str(k) for k in cpuset),
        },
        # TODO: Fix potential permissions issues arising from this setting
        # This can cause problems if users expect to run as a specific user
        # Docker expects the user as "<uid>:<gid>".
        'User': '%s:%s' % (uid, gid),
    }

    if self._use_nvidia_docker:
        # Allocate the requested number of GPUs and isolate
        self._add_nvidia_docker_arguments(create_request, [str(k) for k in gpuset])

    with closing(self._create_connection()) as create_conn:
        create_conn.request('POST', '/containers/create',
                            json.dumps(create_request),
                            {'Content-Type': 'application/json'})
        create_response = create_conn.getresponse()
        if create_response.status != 201:
            raise DockerException(create_response.read())
        container_id = json.loads(create_response.read())['Id']

    # Start the container.
    logger.debug(
        'Starting Docker container for UUID %s with command %s, container ID %s',
        uuid, command, container_id)
    with closing(self._create_connection()) as start_conn:
        start_conn.request('POST', '/containers/%s/start' % container_id)
        start_response = start_conn.getresponse()
        if start_response.status != 204:
            raise DockerException(start_response.read())

    return container_id
def requested_memory_bytes(self):
    """
    Return the 'request_memory' entry of self._resources when it is truthy;
    otherwise fall back to the size of 4 megabytes (this is for backwards
    compatibility).
    """
    requested = self._resources['request_memory']
    if requested:
        return requested
    return parse_size('4m')
def requested_memory_bytes(self):
    """
    Return the 'request_memory' entry of self.resources when it is present
    and truthy; otherwise fall back to the size of 4m (for backwards
    compatibility).
    """
    requested = self.resources.get('request_memory')
    if requested:
        return requested
    return parse_size('4m')