Example #1
def handle_event_register_prog(event):
    """
    handle 'register_prog' event
    data must include 'name', 'checksum' and 'data'
    :event: event to handle
    :returns: event result data if event successfully handled
    :raises: ValueError: if program data does not match checksum
    """
    data = event.data['data']
    name = event.data['name']
    checksum = event.data['checksum']
    with client.client_access() as c:
        user_progs_dir = c.user_progs_dir
        hardware = c.hardware
    prog_dir = os.path.join(user_progs_dir, checksum)
    data_file = os.path.join(prog_dir, 'data.json')
    os.mkdir(prog_dir)
    with open(data_file, 'w') as fp:
        fp.write(data)
    program = user_prog.UserProg(
        name, checksum, data_file, hardware, build_dir=prog_dir)
    program.verify_checksum()
    with client.client_access() as c:
        cuda_bin = c.args.bin
        include_path = c.args.include
    program.build(cuda_bin=cuda_bin, include_path=include_path)
    with client.client_access() as c:
        c.user_programs[checksum] = program
    LOG.info('Registered program: %s', program)
    return {}
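
The docstring above lists the required event fields; as a concrete illustration, a 'register_prog' payload could be shaped like this (all values invented; the 'code' keys follow the unpack() example later in this listing):

# Illustrative payload only; field names come from the docstring above.
event_data = {
    'name': 'example-prog',          # hypothetical program name
    'checksum': '<sha-of-data>',     # placeholder, not a real digest
    'data': '{"code": {"cuda": "...", "python": "...", "header": "..."}}',
}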
def run_server(args, tmpdir):
    """
    entrypoint for server
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: 0 on success
    """
    # create server state
    server.create_state(args, tmpdir)
    # init remote event system
    remote_event.create_remote_events()
    # start api server
    call = functools.partial(
        server.APP.run, debug=False, host=args.host, port=args.port)
    thread = threading.Thread(target=call)
    thread.daemon = True
    thread.start()
    # start server worker
    worker = server_worker.ServerWorker()
    LOG.info('starting server worker')
    try:
        worker.run()
    except queue.Empty:
        server_util.shutdown_all_clients()
    return 0
def check_gpus(args, tmpdir):
    """
    check for CUDA capable GPUs
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: dict with GPU info
    """
    if args.no_gpu:
        LOG.warning("Not scanning available gpus, running programs will fail")
        return {'num_gpus': 0, 'gpu_info': []}
    LOG.info('Checking CUDA build system')
    program = setup_cuda_detect(args, tmpdir)
    res = {
        'num_gpus': program.get_num_gpus(),
        'gpu_info': [],
    }
    for gpu_index in range(res['num_gpus']):
        props = GPUProps()
        program.get_gpu_data(gpu_index, ctypes.byref(props))
        gpu_info = {
            'gpu_index': props.gpu_index,
            'comp_level_major': props.comp_level_major,
            'comp_level_minor': props.comp_level_minor,
            'sm_count': props.sm_count,
            'max_sm_threads': props.max_sm_threads,
            'max_sm_blocks': props.max_sm_blocks,
            'max_block_size': props.max_block_size,
            'max_total_threads': props.max_total_threads,
            'max_total_blocks': props.max_total_blocks,
            'name': props.name.decode(),
        }
        gpu_info['reasonable_block_size'] = get_reasonable_block_size(gpu_info)
        res['gpu_info'].append(gpu_info)
    return res
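
check_gpus() fills one ctypes structure per device; a minimal sketch of what GPUProps might look like, assuming plain int fields and a fixed-size name buffer (the real definition lives in the project's CUDA detection helper):

import ctypes

class GPUProps(ctypes.Structure):
    # Hypothetical layout matching the fields read in check_gpus() above.
    _fields_ = [
        ('gpu_index', ctypes.c_int),
        ('comp_level_major', ctypes.c_int),
        ('comp_level_minor', ctypes.c_int),
        ('sm_count', ctypes.c_int),
        ('max_sm_threads', ctypes.c_int),
        ('max_sm_blocks', ctypes.c_int),
        ('max_block_size', ctypes.c_int),
        ('max_total_threads', ctypes.c_int),
        ('max_total_blocks', ctypes.c_int),
        ('name', ctypes.c_char * 256),  # bytes, decoded with .decode() above
    ]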
Example #4
 def __enter__(self):
     """
     Create tempdir
     :returns: tempdir path
     """
     self.tempdir = tempfile.mkdtemp(prefix=self.prefix)
     LOG.debug('Using tempdir: %s', self.tempdir)
     return self.tempdir
Example #5
 def __exit__(self, etype, value, trace):
     """
     Destroy tempdir unless an exception occurred or preserve is set
     :etype: exception type
     :value: exception value
     :trace: exception traceback
     """
     if etype or self.preserve:
         LOG.info('Preserving tempdir: %s', self.tempdir)
     else:
         shutil.rmtree(self.tempdir)
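
Together with __enter__ above, this makes the helper usable as a context manager; a minimal usage sketch, mirroring the util.TempDir call in main() further down:

# Usage sketch; util.TempDir is the project's helper shown above.
with util.TempDir(preserve=False) as tmpdir:
    print('working in', tmpdir)  # removed on clean exit, kept on error or preserve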
Example #6
 def unregister_client(self, client_uuid):
     """
     unregister a client with the server
     :client_uuid: client uuid
     :returns: client object
     """
     self.all_clients_hardware.pop(client_uuid, None)
     client = self.clients.pop(client_uuid, None)
     if client:
         LOG.info('Deleted client: %s', client)
     return client
Example #7
 def unpack(self, item_keys):
     """
     unpack program files and set up build dir structure
     :item_keys: items to unpack
     """
     LOG.debug('Extracting user program code')
     for key in item_keys:
         filename = PROGRAM_SOURCE_FILE_NAMES[key]
         code = self.data['code'][key]
         path = os.path.join(self.build_dir, filename)
         with open(path, 'w') as fp:
             fp.write(code)
Example #8
 def run_iteration(self, global_state_enc):
     """
     update global state, run iteration, and encode aggregation result
     :global_state_enc: encoded global state
     :returns: encoded aggregation result
     """
     LOG.info("Running iteration")
     # FIXME move into py_mod
     import array
     self.global_state = array.array('d', global_state_enc[1])
     partial_results = self.py_mod.run_iteration(self.global_params, self.data_count,
             self.global_state, self.pinned_memory, self.dataset)
     return partial_results
Example #9
 def copy_build_files(self, build_files):
     """
     copy build files from data/build_files into build dir
     :build_files: names of files to copy
     """
     LOG.debug('Copying additional build files')
     for build_file in build_files:
         resource_path = os.path.join(
             PROGRAM_DATA_DIRNAME, 'build_files', build_file)
         data = pkgutil.get_data('lizard', resource_path)
         path = os.path.join(self.build_dir, build_file)
         with open(path, 'wb') as fp:
             fp.write(data)
Example #10
def _normalize_cuda_args(args):
    """normalize cuda arguments"""
    if args.bin:
        args.bin = os.path.abspath(args.bin)
        if not os.path.isdir(args.bin):
            LOG.error('invalid bin path specified')
            return None
    if args.include:
        args.include = os.path.abspath(args.include)
        if not os.path.isdir(args.include):
            LOG.error('invalid include path specified')
            return None
    return args
def scan_hardware(args, tmpdir):
    """
    scan system hardware
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: dict with hardware info
    """
    hardware = {
        'CPU': check_cpus(),
        'GPU': check_gpus(args, tmpdir),
    }
    LOG.debug('hardware scan found: %s', hardware)
    return hardware
def get_reasonable_block_size(props, size_mult=32):
    """
    get reasonable cuda block size
    :props: gpu properties dict
    :size_mult: block size multiple
    :returns: reasonable block size
    """
    max_reasonable_size = props['max_block_size']
    min_reasonable_size = props['max_sm_threads'] / props['max_sm_blocks']
    avg_reasonable_size = (max_reasonable_size + min_reasonable_size) / 2
    reasonable_block_size = int(avg_reasonable_size/size_mult) * size_mult
    LOG.debug('Using CUDA block size: %s', reasonable_block_size)
    return reasonable_block_size
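
To make the arithmetic concrete, a worked example with invented property values:

# Invented GPU properties, chosen only to illustrate the formula above.
props = {'max_block_size': 1024, 'max_sm_threads': 2048, 'max_sm_blocks': 16}
# min_reasonable_size = 2048 / 16 = 128.0
# avg_reasonable_size = (1024 + 128.0) / 2 = 576.0
# floored to a multiple of 32 -> 576
assert get_reasonable_block_size(props) == 576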
 def run(self):
     """
     main loop, wait for event, then process
     if shutdown is scheduled, continue until the queue is empty
     :returns: does not return normally; control ends via exception
     :raises queue.Empty: when shutdown requested and queue empty
     """
     while True:
         event = SERVER_QUEUE.get(block=not self.shutdown_scheduled)
         LOG.debug('received event: %s', event)
         self.handle_event(event)
         LOG.debug('handled event: %s', event)
         SERVER_QUEUE.task_done()
Example #14
    def load_data(self, dataset_enc):
        """
        load dataset
        :dataset_enc: encoded data
        """
        LOG.info('in load_data')
        LOG.info('first line of data:')
        # LOG.info(dataset_enc[0])
        # LOG.info(dataset_enc[1][0])
        self.dataset = self.py_mod.to_array(dataset_enc[1])
        self.data_count = dataset_enc[0]
        LOG.info('converted to array')

        self.pinned_memory = self.prog.pin_gpu_memory(self.dataset)
        LOG.info('data count %i', self.data_count)
        LOG.info('pinned data')
Example #15
    def partition_data(self, data):
        """
        load dataset and partition among clients
        :data: data
        """
        client_uuids = list(self.hardware.keys())
        client_count = len(client_uuids)
        LOG.debug("data size %i", sys.getsizeof(data))
        # FIXME this is a really rough estimate as the final calculation is done
        # after casting to double

        data_generator = self.py_mod.split_data(data)
        LOG.info(self.global_params)
        split_size = self.global_params[0] // client_count + 1
        LOG.debug("split size %i", split_size)
        post_datasets = {}
        for client_uuid in client_uuids:
            LOG.info("Splitting data")
            # FIXME use hardware scan to discover GPU mem size
            # currently rounded slightly down to avoid overflowing in loop
            # 8G gpu ram size
            # gpu_mem_remaining = 8589934592
            gpu_mem_remaining = 8500000000
            split_remaining = split_size
            data_count = 0

            LOG.info("global_params %s", self.global_params)
            dataset = []
            # subtract params size
            gpu_mem_remaining = (gpu_mem_remaining -
                                 sys.getsizeof(self.global_params))
            try:
                while split_remaining > 0 and gpu_mem_remaining > 0:
                    next_split = next(data_generator)
                    split_remaining = split_remaining - 1
                    gpu_mem_remaining = (gpu_mem_remaining -
                                         sys.getsizeof(next_split))
                    dataset.append(next_split)
                    data_count = data_count + 1
            except StopIteration:
                pass

            dataset_enc = [data_count, dataset]
            self.client_datasets[client_uuid] = dataset_enc
        self._initialize_global_state()
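
A quick illustration of the split sizing above, assuming global_params[0] holds the total record count (numbers invented):

# Invented numbers; global_params[0] is assumed to be the total record count.
total_records, client_count = 10, 3
split_size = total_records // client_count + 1  # 10 // 3 + 1 = 4 records per client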
def handle_event_register_prog(event):
    """
    handle 'register_prog' event
    data must include 'name', 'checksum' and 'data'
    :event: event to handle
    :returns: event result data if event successfully handled
    :raises: Exception: if error occurs handling event
    """
    data = event.data['data']
    name = event.data['name']
    checksum = event.data['checksum']
    wakeup_ev = threading.Event()

    def multi_callback_func(event_props):
        wakeup_ev.set()

    def callback_func(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: failed to register program'.format(client))
        client.registered_progs.append(checksum)

    with server.state_access() as s:
        user_progs_dir = s.user_progs_dir
        all_hardware = s.all_clients_hardware
    prog_dir = os.path.join(user_progs_dir, checksum)
    data_file = os.path.join(prog_dir, 'data.json')
    os.mkdir(prog_dir)
    with open(data_file, 'w') as fp:
        fp.write(data)
    program = user_prog.UserProg(
        name, checksum, data_file, all_hardware, build_dir=prog_dir)
    program.build_for_server()
    post_data = event.data.copy()
    post_data['send_remote_event'] = True
    with server.state_access() as s:
        s.post_all('/programs', post_data, callback_func=callback_func,
                   multi_callback_func=multi_callback_func)
    # NOTE: timeout for registering program on all nodes set to 10 min
    wakeup_ev.wait(timeout=600)
    LOG.info('Registered user program: %s', program)
    with server.state_access() as s:
        s.registered_progs[checksum] = program
    return program.properties
Example #17
 def handle(self):
     """
     Handle event using handler defined in event handler map and set result
     """
     if self.event_handler_map is None:
         raise NotImplementedError("Cannot handle BaseEvent")
     handler = self.event_handler_map.get(
         self.event_type, handler_not_implemented)
     start_time = time.time()
     try:
         self.status = EventStatus.RUNNING
         self.result = handler(self)
         self.status = EventStatus.SUCCESS
     except Exception as e:
         msg = repr(e)
         LOG.warning("Failed to complete event: %s error: %s", self, msg)
         self.status = EventStatus.FAILURE
         self.result = {'error': msg}
     end_time = time.time()
     self.completion_time = end_time - start_time
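
handle() dispatches through event_handler_map; a sketch of how such a map could tie event types to the handlers in this listing (illustrative only, the real maps are defined on the concrete event classes):

# Illustrative only; the real mapping is defined per concrete event class.
SERVER_EVENT_HANDLER_MAP = {
    'register_prog': handle_event_register_prog,
    'run_program': handle_event_run_program,
}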
Example #18
def main():
    """
    main entry point
    :returns: 0 on success
    """
    subcmd_handlers = {
        'client': run_client,
        'server': run_server,
        'cluster': run_cluster,
    }

    # get the argument parser
    parser = cli.configure_parser()
    # parse arguments
    args = parser.parse_args()
    # normalize arguments
    args = cli.normalize_args(args)
    if args is None:
        return -1

    # set log level
    LOG.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    LOG.debug('logging system init')
    LOG.debug('running with args: %s', args)

    # create tmpdir and run handler
    with util.TempDir(preserve=args.keep_tmpdir) as tmpdir:
        return subcmd_handlers[args.subcmd](args, tmpdir)
Example #19
def handle_event_init_runtime(event):
    """
    handle 'init_runtime' event
    data must include:
        - 'runtime_id',
        - 'dataset_enc',
        - 'checksum',
        - 'global_params_enc'
    :event: event to handle
    :returns: event result
    """
    runtime_id = event.data['runtime_id']
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']
    with client.client_access() as c:
        program = c.user_programs[prog_checksum]
    runtime = program.get_new_program_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.load_data(dataset_enc)
    LOG.info('Loaded client program instance')
    return {}
Example #20
def _normalize_server_args(args):
    """normalize server arguments"""
    # ensure port number is valid
    if args.port > 65535:
        LOG.error('port number invalid: %s', args.port)
        return None
    elif args.port < 1024:
        LOG.warning('port number requires root priv: %s', args.port)

    # NOTE: does not support ipv6 bind addrs and may allow some invalid addrs
    if len(args.host.split('.')) != 4:
        LOG.error('invalid bind addr: %s', args.host)
        return None

    return args
def shutdown_all_clients(max_wait=5, wait_interval=0.2):
    """
    shut down all clients because the server is terminating
    :max_wait: max time to wait for clients to shutdown before returning
    :wait_interval: poll interval to check if all clients have terminated
    """
    LOG.info('Instructing all clients to shutdown')
    with server.state_access() as s:
        s.get_all('/shutdown')
    for _ in range(int(max_wait / wait_interval)):
        time.sleep(wait_interval)
        with server.state_access() as s:
            client_count = len(s.clients)
        if client_count == 0:
            LOG.info('All clients terminated')
            break
    else:
        LOG.warning('Not all clients terminated, shutting down anyway')
Example #22
        self.user_progs_dir = os.path.join(self.tmpdir, 'user_progs_server')
        os.mkdir(self.user_progs_dir)

    def register_client(self, hardware, client_ip, client_port):
        """
        register a client with the server
        :hardware: hardware info dict
        :client_ip: addr of client
        :client_port: port number for client
        :returns: client uuid
        """
        client_uuid = util.hex_uuid()
        url = 'http://{}:{}'.format(client_ip, client_port)
        self.clients[client_uuid] = ClientState(client_uuid, hardware, url)
        with remote_event.remote_events_access() as r:
            r.register_client(client_uuid)
        self.all_clients_hardware[client_uuid] = hardware
        LOG.info('Registered client: %s', self.clients[client_uuid])
        return client_uuid

    def unregister_client(self, client_uuid):
        """
        unregister a client with the server
        :client_uuid: client uuid
        :returns: client object
Example #23
    client.create_client(args, tmpdir, hardware)
    # automatically find available port
    client_port = util.get_free_port()
    # start client api server
    call = functools.partial(
        client.APP.run, debug=False, host='0.0.0.0', port=client_port)
    thread = threading.Thread(target=call)
    thread.daemon = True
    thread.start()
    # register with server
    with client.client_access() as c:
        c.register(client_port)
    # start client worker
    worker = client_worker.ClientWorker()
    LOG.info('starting client worker')
    try:
        worker.run()
    except queue.Empty:
        with client.client_access() as c:
            c.shutdown()
    return 0


def run_server(args, tmpdir):
    """
    entrypoint for server
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: 0 on success
    """
def handle_event_run_program(event):
    """
    handle 'run_program' event
    :event: event to handle
    :returns: program result
    :raises: Exception: if error occurs or invalid request
    """
    runtime_id = util.hex_uuid()
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']
    init_path = os.path.join('/runtimes', prog_checksum, runtime_id)
    iterate_path = os.path.join(init_path, 'iterate')
    cleanup_path = os.path.join(init_path, 'cleanup')
    wakeup_ev = threading.Event()

    def multi_callback_wakeup(event_props):
        wakeup_ev.set()

    def runtime_init_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime init'.format(client))

    with server.state_access() as s:
        program = s.registered_progs[prog_checksum]
    if not program.ready:
        raise ValueError('cannot run program, not ready')
    runtime = program.get_new_server_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.partition_data(dataset_enc)
    runtime_init_remote_event_ids = []
    for client_uuid, dataset_enc in runtime.dataset_partitions_encoded.items():
        data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'dataset_enc': dataset_enc,
            'global_params_enc': global_params_enc,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            c = s.clients[client_uuid]
            res = c.post(init_path, data, callback_func=runtime_init_callback)
            runtime_init_remote_event_ids.append(res['event_id'])
    with remote_event.remote_events_access() as r:
        r.register_multi_callback(
            runtime_init_remote_event_ids, multi_callback_wakeup)
    wakeup_ev.wait(timeout=300)
    wakeup_ev.clear()
    LOG.info('Runtime initialized for user program: %s', program)
    aggregation_lock = threading.Lock()

    def run_iteration_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error running prog iteration'.format(client))
        with aggregation_lock:
            runtime.aggregate(event_props['result']['aggregation_result_enc'])
    runtime.reset_aggregation_result()
    iteration_count = 0
    while True:
        post_data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'global_state_enc': runtime.global_state_encoded,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            s.post_all(
                iterate_path, post_data, callback_func=run_iteration_callback,
                multi_callback_func=multi_callback_wakeup)
        wakeup_ev.wait(timeout=600)
        wakeup_ev.clear()
        runtime.update_global_state()
        runtime.reset_aggregation_result()
        LOG.debug('Completed iteration for user program: %s', program)
        iteration_count = iteration_count + 1
        if runtime.done:
            break

    LOG.info('Cleaning up...')
    def runtime_cleanup_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime clean up'.format(client))
    post_data = {
        'runtime_id': runtime_id,
        'checksum': prog_checksum,
        'send_remote_event': True,
    }
    with server.state_access() as s:
        s.post_all(
            cleanup_path, post_data, callback_func=runtime_cleanup_callback,
            multi_callback_func=multi_callback_wakeup)
    wakeup_ev.wait(timeout=60)
    wakeup_ev.clear()

    LOG.info('Finished running user program: %s %i', program, iteration_count)
    return {
        'end_aggregate': runtime.top_level_aggregate_encoded,
        'end_global_state': runtime.global_state_encoded,
    }
Example #25
def client_item(client_id):
    """
    GET,DELETE /clients/<client_id>: query clients
    :client_id: client uuid
    :returns: flask response
    """
    if request.method == 'GET':
        with server.state_access() as state:
            client = state.clients.get(client_id)
        return (respond_json(client.properties) if client else
                respond_error(404))
    elif request.method == 'DELETE':
        with server.state_access() as state:
            client = state.unregister_client(client_id)
        return Response("ok") if client is not None else respond_error(404)


@APP.route('/programs', methods=['GET', 'POST'])
def programs():
    """
    GET,POST /programs: register or list programs
    :returns: flask response
    """
    if request.method == 'POST':
        event_data = request.get_json()
        if not all(n in event_data for n in ('name', 'data', 'checksum')):
Example #26
    def build(
            self, cuda_bin=None, include_path=None, unpack=True,
            set_compute_level=True):
        """
        set up user program resources and build shared obj
        :cuda_bin: path to cuda tools bin
        :include_path: path to cuda include dir
        :unpack: if true, unpack program json
        :set_compute_level: if true, specify appropriate compute level
        """
        if not self.build_dir or not os.path.isdir(self.build_dir):
            raise ValueError("Build dir not set up")
        if unpack:
            files = ['cuda', 'python', 'header']
            if self.use_c_extention:
                files.append('cpp')
            self.unpack(files)
        build_files = ['Makefile']
        if self.use_c_extention:
            build_files.append('setup.py')
        self.copy_build_files(build_files)
        make_cmd = ['make', '-C', self.build_dir]
        if cuda_bin is not None:
            nvcc_path = os.path.join(cuda_bin, 'nvcc')
            make_cmd.append('NVCC={}'.format(nvcc_path))
        if include_path is not None:
            make_cmd.append('CUDA_L64=-L{}'.format(include_path))
        if set_compute_level:
            flag_value = '-arch={}'.format(self.compute_level)
            make_cmd.append('COMPUTE_LEVEL_FLAG={}'.format(flag_value))
            LOG.debug('Using compute level: %s', flag_value)
        else:
            LOG.warning('Using default compute level, not optimized')
        LOG.debug('Building CUDA shared object')
        util.subp(make_cmd)

        if self.use_c_extention:
            LOG.debug('Building Python wrapper module')

            # XXX
            # FIXME create hardcoded tmp dir used by dynamic linker
            shared_dll = 'user_program_cuda.so'
            tmp_dir = '/tmp/lizard-slayer/'
            pathlib.Path(tmp_dir).mkdir(exist_ok=True)
            for the_file in os.listdir(tmp_dir):
                file_path = os.path.join(tmp_dir, the_file)
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)

            setup_cmd = ['python3', 'setup.py', 'build_ext', '-b', tmp_dir]

            util.subp(setup_cmd, cwd=self.build_dir)
            # copy over the shared library to be found by the linker
            shutil.copyfile(os.path.join(self.build_dir, shared_dll),
                            os.path.join(tmp_dir, shared_dll))
            # FIXME remove path
            sys.path.append(tmp_dir)
            sys.path.append(self.build_dir)
            self.ready = True
        else:
            LOG.debug('No Python C extension for user program')
            self.ready = True
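
For reference, with illustrative arguments the make invocation assembled in build() would end up looking like this (paths and compute level are made up):

# Invented paths and compute level; shows the shape of make_cmd built above.
make_cmd = ['make', '-C', '/tmp/build',
            'NVCC=/usr/local/cuda/bin/nvcc',
            'CUDA_L64=-L/usr/local/cuda/lib64',
            'COMPUTE_LEVEL_FLAG=-arch=sm_61']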