Example #1
0
            print >> sys.stderr, """
Permissions on password file are too lax.
Only the user should be allowed to access the file.
On Linux, run:
chmod 600 %s""" % args.password_file
            exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = raw_input('Username: '******'%(asctime)s %(message)s',
                            level=logging.DEBUG)

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    worker = Worker(args.id, args.tag, args.work_dir, max_work_dir_size_bytes,
                    args.shared_file_system, args.slots,
                    BundleServiceClient(args.server, username, password),
                    DockerClient())

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    print 'Worker started.'
    worker.run()
Example #2
0
def main():
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument('--tag', help='Tag that allows for scheduling runs on specific ' 'workers.')
    parser.add_argument(
        '--server',
        default='https://worksheets.codalab.org',
        help='URL of the CodaLab server, in the format '
        '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)',
    )
    parser.add_argument(
        '--work-dir',
        default='codalab-worker-scratch',
        help='Directory where to store temporary bundle data, '
        'including dependencies and the data from run '
        'bundles.',
    )
    parser.add_argument(
        '--network-prefix', default='codalab_worker_network', help='Docker network name prefix'
    )
    parser.add_argument(
        '--cpuset',
        type=str,
        metavar='CPUSET_STR',
        default='ALL',
        help='Comma-separated list of CPUs in which to allow bundle execution, '
        '(e.g., \"0,2,3\", \"1\").',
    )
    parser.add_argument(
        '--gpuset',
        type=str,
        metavar='GPUSET_STR',
        default='ALL',
        help='Comma-separated list of GPUs in which to allow bundle execution. '
        'Each GPU can be specified by its index or UUID'
        '(e.g., \"0,1\", \"1\", \"GPU-62casdfasd-asfas...\"',
    )
    parser.add_argument(
        '--max-work-dir-size',
        type=str,
        metavar='SIZE',
        default='10g',
        help='Maximum size of the temporary bundle data ' '(e.g., 3, 3k, 3m, 3g, 3t).',
    )
    parser.add_argument(
        '--max-image-cache-size',
        type=str,
        metavar='SIZE',
        help='Limit the disk space used to cache Docker images '
        'for worker jobs to the specified amount (e.g. '
        '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
        'the least recently used images are removed first. '
        'Worker will not remove any images if this option '
        'is not specified.',
    )
    parser.add_argument(
        '--password-file',
        help='Path to the file containing the username and '
        'password for logging into the bundle service, '
        'each on a separate line. If not specified, the '
        'password is read from standard input.',
    )
    parser.add_argument(
        '--verbose', action='store_true', help='Whether to output verbose log messages.'
    )
    parser.add_argument(
        '--exit-when-idle',
        action='store_true',
        help='If specified the worker quits if it finds itself with no jobs after a checkin',
    )
    parser.add_argument(
        '--id',
        default='%s(%d)' % (socket.gethostname(), os.getpid()),
        help='Internal use: ID to use for the worker.',
    )
    parser.add_argument(
        '--shared-file-system',
        action='store_true',
        help='Internal use: Whether the file system containing '
        'bundle data is shared between the bundle service '
        'and the worker.',
    )
    args = parser.parse_args()

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            print >>sys.stderr, """
Permissions on password file are too lax.
Only the user should be allowed to access the file.
On Linux, run:
chmod 600 %s""" % args.password_file
            exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: '******'CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    # Set up logging.
    if args.verbose:
        logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    try:
        bundle_service = BundleServiceClient(args.server, username, password)
    except BundleAuthException as ex:
        logger.error('Cannot log into the bundle service. Please check your worker credentials.\n')
        logger.debug('Auth error: {}'.format(ex))
        return

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)

    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)

    def create_local_run_manager(worker):
        """
        To avoid circular dependencies the Worker initializes takes a RunManager factory
        to initilize its run manager. This method creates a LocalFilesystem-Docker RunManager
        which is the default execution architecture Codalab uses
        """
        docker_runtime = docker_utils.get_available_runtime()
        cpuset = parse_cpuset_args(args.cpuset)
        gpuset = parse_gpuset_args(args.gpuset)

        dependency_manager = LocalFileSystemDependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            max_work_dir_size_bytes,
        )

        image_manager = DockerImageManager(
            os.path.join(args.work_dir, 'images-state.json'), max_images_bytes
        )

        return LocalRunManager(
            worker,
            image_manager,
            dependency_manager,
            os.path.join(args.work_dir, 'run-state.json'),
            cpuset,
            gpuset,
            args.work_dir,
            docker_runtime=docker_runtime,
            docker_network_prefix=args.network_prefix,
        )

    worker = Worker(
        create_local_run_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.id,
        args.tag,
        args.work_dir,
        args.exit_when_idle,
        bundle_service,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
Example #3
0
def main():
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument('--tag',
                        help='Tag that allows for scheduling runs on specific '
                        'workers.')
    parser.add_argument(
        '--server',
        default='https://worksheets.codalab.org',
        help='URL of the CodaLab server, in the format '
        '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)'
    )
    parser.add_argument('--work-dir',
                        default='codalab-worker-scratch',
                        help='Directory where to store temporary bundle data, '
                        'including dependencies and the data from run '
                        'bundles.')
    parser.add_argument('--max-work-dir-size',
                        type=str,
                        metavar='SIZE',
                        default='10g',
                        help='Maximum size of the temporary bundle data '
                        '(e.g., 3, 3k, 3m, 3g, 3t).')
    parser.add_argument(
        '--max-image-cache-size',
        type=str,
        metavar='SIZE',
        help='Limit the disk space used to cache Docker images '
        'for worker jobs to the specified amount (e.g. '
        '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
        'the least recently used images are removed first. '
        'Worker will not remove any images if this option '
        'is not specified.')
    parser.add_argument('--slots',
                        type=int,
                        default=1,
                        help='Number of slots to use for running bundles. '
                        'A single bundle takes up a single slot.')
    parser.add_argument('--password-file',
                        help='Path to the file containing the username and '
                        'password for logging into the bundle service, '
                        'each on a separate line. If not specified, the '
                        'password is read from standard input.')
    parser.add_argument('--verbose',
                        action='store_true',
                        help='Whether to output verbose log messages.')
    parser.add_argument('--id',
                        default='%s(%d)' % (socket.gethostname(), os.getpid()),
                        help='Internal use: ID to use for the worker.')
    parser.add_argument(
        '--shared-file-system',
        action='store_true',
        help='Internal use: Whether the file system containing '
        'bundle data is shared between the bundle service '
        'and the worker.')
    args = parser.parse_args()

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            print >> sys.stderr, """
Permissions on password file are too lax.
Only the user should be allowed to access the file.
On Linux, run:
chmod 600 %s""" % args.password_file
            exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: '******'CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    # Set up logging.
    if args.verbose:
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.INFO)

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)
    worker = Worker(args.id, args.tag, args.work_dir, max_work_dir_size_bytes,
                    max_images_bytes, args.shared_file_system, args.slots,
                    BundleServiceClient(args.server, username, password),
                    DockerClient())

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    logger.info('Worker started.')
    worker.run()
Example #4
0
def main():
    parser = argparse.ArgumentParser(description='CodaLab worker.')
    parser.add_argument('--tag',
                        help='Tag that allows for scheduling runs on specific '
                             'workers.')
    parser.add_argument('--server', default='https://worksheets.codalab.org',
                        help='URL of the CodaLab server, in the format '
                             '<http|https>://<hostname>[:<port>] (e.g., https://worksheets.codalab.org)')
    parser.add_argument('--work-dir', default='codalab-worker-scratch',
                        help='Directory where to store temporary bundle data, '
                             'including dependencies and the data from run '
                             'bundles.')
    parser.add_argument('--network-prefix', default='codalab_worker_network',
                        help='Docker network name prefix')
    parser.add_argument('--cpuset', type=str, metavar='CPUSET_STR', default='ALL',
                        help='Comma-separated list of CPUs in which to allow bundle execution, '
                             '(e.g., \"0,2,3\", \"1\").')
    parser.add_argument('--gpuset', type=str, metavar='GPUSET_STR', default='ALL',
                        help='Comma-separated list of GPUs in which to allow bundle execution '
                             '(e.g., \"0,1\", \"1\").')
    parser.add_argument('--max-work-dir-size', type=str, metavar='SIZE', default='10g',
                        help='Maximum size of the temporary bundle data '
                             '(e.g., 3, 3k, 3m, 3g, 3t).')
    parser.add_argument('--max-dependencies-serialized-length', type=int, default=60000,
                        help='Maximum length of serialized json of dependency list of worker '
                             '(e.g., 50, 30000, 60000).')
    parser.add_argument('--max-image-cache-size', type=str, metavar='SIZE',
                        help='Limit the disk space used to cache Docker images '
                             'for worker jobs to the specified amount (e.g. '
                             '3, 3k, 3m, 3g, 3t). If the limit is exceeded, '
                             'the least recently used images are removed first. '
                             'Worker will not remove any images if this option '
                             'is not specified.')
    parser.add_argument('--password-file',
                        help='Path to the file containing the username and '
                             'password for logging into the bundle service, '
                             'each on a separate line. If not specified, the '
                             'password is read from standard input.')
    parser.add_argument('--verbose', action='store_true',
                        help='Whether to output verbose log messages.')
    parser.add_argument('--id', default='%s(%d)' % (socket.gethostname(), os.getpid()),
                        help='Internal use: ID to use for the worker.')
    parser.add_argument('--shared-file-system', action='store_true',
                        help='Internal use: Whether the file system containing '
                             'bundle data is shared between the bundle service '
                             'and the worker.')
    parser.add_argument('--batch-queue',
                        help='Name of the AWS Batch queue to use for run submission. '
                             'Providing this option will cause runs to be submitted to Batch rather than local docker. '
                             'The queue must already exist and you must have AWS credentials to submit to it.'
                        )
    args = parser.parse_args()

    # Get the username and password.
    logger.info('Connecting to %s' % args.server)
    if args.password_file:
        if os.stat(args.password_file).st_mode & (stat.S_IRWXG | stat.S_IRWXO):
            print >>sys.stderr, """
Permissions on password file are too lax.
Only the user should be allowed to access the file.
On Linux, run:
chmod 600 %s""" % args.password_file
            exit(1)
        with open(args.password_file) as f:
            username = f.readline().strip()
            password = f.readline().strip()
    else:
        username = os.environ.get('CODALAB_USERNAME')
        if username is None:
            username = raw_input('Username: '******'CODALAB_PASSWORD')
        if password is None:
            password = getpass.getpass()

    # Set up logging.
    if args.verbose:
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.INFO)

    max_work_dir_size_bytes = parse_size(args.max_work_dir_size)
    max_dependencies_serialized_length = args.max_dependencies_serialized_length
    if args.max_image_cache_size is None:
        max_images_bytes = None
    else:
        max_images_bytes = parse_size(args.max_image_cache_size)

    bundle_service = BundleServiceClient(args.server, username, password)

    # TODO Break the dependency of RunManagers on Worker to make this initialization nicer
    def create_run_manager(w):
        if args.batch_queue is None:
            # We defer importing the run managers so their dependencies are lazily loaded
            from docker_run import DockerRunManager
            from docker_client import DockerClient
            from docker_image_manager import DockerImageManager

            logging.info("Using local docker client for run submission.")

            docker = DockerClient()
            image_manager = DockerImageManager(docker, args.work_dir, max_images_bytes)
            cpuset = parse_cpuset_args(args.cpuset)
            gpuset = parse_gpuset_args(docker, args.gpuset)
            return DockerRunManager(docker, bundle_service, image_manager, w, args.network_prefix, cpuset, gpuset)
        else:
            try:
                import boto3
            except ImportError:
                logging.exception("Missing dependencies, please install boto3 to enable AWS support.")
                import sys
                sys.exit(1)

            from aws_batch import AwsBatchRunManager

            logging.info("Using AWS Batch queue %s for run submission.", args.batch_queue)

            batch_client = boto3.client('batch')
            return AwsBatchRunManager(batch_client, args.batch_queue, bundle_service, w)

    worker = Worker(args.id, args.tag, args.work_dir, max_work_dir_size_bytes, max_dependencies_serialized_length,
                    args.shared_file_system, bundle_service, create_run_manager)

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signup, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    print('Worker started.')
    # END

    worker.run()