Ejemplo n.º 1
0
def handle_event_register_prog(event):
    """
    Handle a 'register_prog' event on the client.

    The event data must include 'name', 'checksum' and 'data'. The program
    data is written to ``<user_progs_dir>/<checksum>/data.json``, wrapped in
    a ``UserProg``, checksum-verified, built, and finally stored in the
    client's ``user_programs`` table under its checksum.

    :event: event to handle
    :returns: empty dict once the program has been registered
    :raises: ValueError: if program data does not match checksum
    """
    payload = event.data
    prog_data = payload['data']
    prog_name = payload['name']
    prog_checksum = payload['checksum']

    with client.client_access() as state:
        progs_root = state.user_progs_dir
        client_hardware = state.hardware

    # every program gets its own directory, named after its checksum
    build_dir = os.path.join(progs_root, prog_checksum)
    json_path = os.path.join(build_dir, 'data.json')
    os.mkdir(build_dir)
    with open(json_path, 'w') as out:
        out.write(prog_data)

    prog = user_prog.UserProg(
        prog_name, prog_checksum, json_path, client_hardware,
        build_dir=build_dir)
    prog.verify_checksum()

    # build settings come from the client's parsed command line arguments
    with client.client_access() as state:
        build_opts = {
            'cuda_bin': state.args.bin,
            'include_path': state.args.include,
        }
    prog.build(**build_opts)

    with client.client_access() as state:
        state.user_programs[prog_checksum] = prog
    LOG.info('Registered program: %s', prog)
    return {}
Ejemplo n.º 2
0
def run_server(args, tmpdir):
    """
    Server entrypoint: create state, launch the API thread, run the worker.

    :args: parsed cmdline args (``host`` and ``port`` are passed to the API)
    :tmpdir: temporary directory
    :returns: 0 on success
    """
    server.create_state(args, tmpdir)
    remote_event.create_remote_events()

    # the API server runs in a daemon thread so it cannot keep the process
    # alive once the worker below has returned
    api_thread = threading.Thread(
        target=server.APP.run,
        kwargs={'debug': False, 'host': args.host, 'port': args.port})
    api_thread.daemon = True
    api_thread.start()

    server_loop = server_worker.ServerWorker()
    LOG.info('starting server worker')
    try:
        server_loop.run()
    except queue.Empty:
        # the worker signals termination by letting queue.Empty escape
        server_util.shutdown_all_clients()
    return 0
def check_gpus(args, tmpdir):
    """
    Scan for CUDA capable GPUs and collect their properties.

    When ``args.no_gpu`` is set no scan is performed and an empty result is
    returned.

    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: dict with 'num_gpus' and 'gpu_info' (one dict per GPU)
    """
    if args.no_gpu:
        LOG.warning("Not scanning available gpus, running programs will fail")
        return {'num_gpus': 0, 'gpu_info': []}

    LOG.info('Checking CUDA build system')
    detector = setup_cuda_detect(args, tmpdir)
    gpu_total = detector.get_num_gpus()

    # fields copied verbatim from the GPUProps structure
    copied_fields = (
        'gpu_index',
        'comp_level_major',
        'comp_level_minor',
        'sm_count',
        'max_sm_threads',
        'max_sm_blocks',
        'max_block_size',
        'max_total_threads',
        'max_total_blocks',
    )
    gpus = []
    for index in range(gpu_total):
        props = GPUProps()
        # get_gpu_data fills the structure through the passed reference
        detector.get_gpu_data(index, ctypes.byref(props))
        info = {field: getattr(props, field) for field in copied_fields}
        info['name'] = props.name.decode()
        info['reasonable_block_size'] = get_reasonable_block_size(info)
        gpus.append(info)
    return {'num_gpus': gpu_total, 'gpu_info': gpus}
Ejemplo n.º 4
0
 def unregister_client(self, client_uuid):
     """
     Remove a client's hardware entry and its client record.

     Unknown uuids are silently ignored in both tables.

     :client_uuid: client uuid
     :returns: the removed client object, or None if the uuid was not known
     """
     self.all_clients_hardware.pop(client_uuid, None)
     removed = self.clients.pop(client_uuid, None)
     if not removed:
         return removed
     LOG.info('Deleted client: %s', removed)
     return removed
Ejemplo n.º 5
0
 def __exit__(self, etype, value, trace):
     """
     Remove the tempdir on exit, unless an exception occurred or
     ``self.preserve`` is set, in which case it is kept and logged.

     :etype: exception type
     :value: exception value
     :trace: exception traceback
     """
     keep = etype or self.preserve
     if not keep:
         shutil.rmtree(self.tempdir)
         return
     LOG.info('Preserving tempdir: %s', self.tempdir)
Ejemplo n.º 6
0
 def run_iteration(self, global_state_enc):
     """
     Replace the global state and run one iteration of the user program.

     The second element of ``global_state_enc`` is converted to an array of
     doubles and stored as ``self.global_state`` before ``py_mod`` is called.

     :global_state_enc: encoded global state
     :returns: whatever ``py_mod.run_iteration`` returns
     """
     LOG.info("Running iteration")
     # FIXME move into py_mod
     from array import array
     self.global_state = array('d', global_state_enc[1])
     return self.py_mod.run_iteration(
         self.global_params,
         self.data_count,
         self.global_state,
         self.pinned_memory,
         self.dataset,
     )
Ejemplo n.º 7
0
    def load_data(self, dataset_enc):
        """
        Convert the encoded dataset to an array and pin it via ``self.prog``.

        ``dataset_enc[0]`` is stored as ``self.data_count`` and
        ``dataset_enc[1]`` is converted with ``py_mod.to_array``.

        :dataset_enc: encoded data
        """
        LOG.info('in load_data')
        LOG.info('first line of data:')
        raw_rows = dataset_enc[1]
        self.dataset = self.py_mod.to_array(raw_rows)
        self.data_count = dataset_enc[0]
        LOG.info('converted to array')

        pinned = self.prog.pin_gpu_memory(self.dataset)
        self.pinned_memory = pinned
        LOG.info('data count %i', self.data_count)
        LOG.info('pinned data')
0
def shutdown_all_clients(max_wait=5, wait_interval=0.2):
    """
    Shut down all clients because the server is terminating.

    Sends '/shutdown' to every client, then polls the server state until no
    clients remain or the polling budget is used up. Returns either way; a
    warning is logged if clients are still registered at the end.

    :max_wait: max time to wait for clients to shutdown before returning
    :wait_interval: poll interval to check if all clients have terminated
    """
    LOG.info('Instructing all clients to shutdown')
    with server.state_access() as s:
        s.get_all('/shutdown')
    # number of polls is max_wait / wait_interval, truncated to an int
    for _ in range(int(max_wait / wait_interval)):
        time.sleep(wait_interval)
        # hold the state lock only long enough to read the client count
        with server.state_access() as s:
            client_count = len(s.clients)
        if client_count == 0:
            LOG.info('All clients terminated')
            break
    else:
        # reached only when the loop ran out without a break
        # (Logger.warn is a deprecated alias, so warning() is used)
        LOG.warning('Not all clients terminated, shutting down anyway')
Ejemplo n.º 9
0
def handle_event_register_prog(event):
    """
    Handle a 'register_prog' event on the server.

    The event data must include 'name', 'checksum' and 'data'. The program is
    written to ``<user_progs_dir>/<checksum>/data.json``, built for the
    server, and then posted to every client via '/programs'. The handler
    waits up to 10 minutes for the clients to report back before recording
    the program in ``registered_progs``.

    :event: event to handle
    :returns: event result data (the program's properties) if event
              successfully handled
    :raises: Exception: if error occurs handling event
    """
    data = event.data['data']
    name = event.data['name']
    checksum = event.data['checksum']
    # set once the multi callback reports that all clients have answered
    wakeup_ev = threading.Event()

    def multi_callback_func(event_props):
        wakeup_ev.set()

    def callback_func(client, event_props):
        # invoked per client with that client's remote event result
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: failed to register program'.format(client))
        client.registered_progs.append(checksum)

    with server.state_access() as s:
        user_progs_dir = s.user_progs_dir
        all_hardware = s.all_clients_hardware
    prog_dir = os.path.join(user_progs_dir, checksum)
    data_file = os.path.join(prog_dir, 'data.json')
    os.mkdir(prog_dir)
    with open(data_file, 'w') as fp:
        fp.write(data)
    program = user_prog.UserProg(
        name, checksum, data_file, all_hardware, build_dir=prog_dir)
    program.build_for_server()
    # forward the original payload to the clients, asking for a remote event
    post_data = event.data.copy()
    post_data['send_remote_event'] = True
    with server.state_access() as s:
        s.post_all('/programs', post_data, callback_func=callback_func,
                   multi_callback_func=multi_callback_func)
    # NOTE: timeout for registering program on all nodes set to 10 min
    # Event.wait returns False on timeout; surface that instead of silently
    # treating a timeout the same as a completed registration
    if not wakeup_ev.wait(timeout=600):
        LOG.warning(
            'Timed out waiting for clients to register user program: %s',
            program)
    LOG.info('Registered user program: %s', program)
    with server.state_access() as s:
        s.registered_progs[checksum] = program
    return program.properties
Ejemplo n.º 10
0
def handle_event_init_runtime(event):
    """
    Handle an 'init_runtime' event: create and prepare a program runtime.

    The event data must include:
        - 'runtime_id',
        - 'dataset_enc',
        - 'checksum',
        - 'global_params_enc'

    :event: event to handle
    :returns: event result (an empty dict)
    """
    payload = event.data
    new_runtime_id = payload['runtime_id']
    encoded_dataset = payload['dataset_enc']
    checksum = payload['checksum']
    encoded_params = payload['global_params_enc']

    # look up the previously registered program by its checksum
    with client.client_access() as state:
        prog = state.user_programs[checksum]

    prog_runtime = prog.get_new_program_runtime(new_runtime_id)
    prog_runtime.prepare_datastructures(encoded_params)
    prog_runtime.load_data(encoded_dataset)
    LOG.info('Loaded client program instance')
    return {}
Ejemplo n.º 11
0
    def partition_data(self, data):
        """
        Load the dataset and partition it among the clients.

        Items are drawn from ``py_mod.split_data(data)`` and handed to the
        clients in order. Each client receives items until either its item
        quota or its estimated GPU memory budget is used up. The result for
        each client is stored in ``self.client_datasets`` as
        ``[item_count, items]``; clients reached after the generator is
        exhausted get an empty partition. Finally the global state is
        initialized.

        :data: data
        :raises: ZeroDivisionError: if no clients are present in
                 ``self.hardware``
        """
        client_uuids = list(self.hardware.keys())
        client_count = len(client_uuids)
        LOG.debug("data size %i", sys.getsizeof(data))
        # FIXME this is a really rough estimate as the final calculation is done
        # after casting to double

        data_generator = self.py_mod.split_data(data)
        LOG.info(self.global_params)
        # presumably global_params[0] is the total item count - TODO confirm
        split_size = self.global_params[0] // client_count + 1
        LOG.debug("split size %i", split_size)

        # FIXME use hardware scan to discover GPU mem size
        # currently rounded slightly down to avoid overflowing in loop
        # 8G gpu ram size (8589934592), minus the size of the params;
        # identical for every client, so computed once up front
        gpu_mem_budget = 8500000000 - sys.getsizeof(self.global_params)

        for client_uuid in client_uuids:
            LOG.info("Splitting data")
            gpu_mem_remaining = gpu_mem_budget
            split_remaining = split_size

            LOG.info("global_params %s", self.global_params)
            dataset = []
            try:
                while split_remaining > 0 and gpu_mem_remaining > 0:
                    next_split = next(data_generator)
                    split_remaining -= 1
                    # sys.getsizeof is shallow, so this is only an estimate
                    gpu_mem_remaining -= sys.getsizeof(next_split)
                    dataset.append(next_split)
            except StopIteration:
                # generator exhausted: this client keeps what it got so far
                pass

            self.client_datasets[client_uuid] = [len(dataset), dataset]
        self._initialize_global_state()
Ejemplo n.º 12
0
        self.user_progs_dir = os.path.join(self.tmpdir, 'user_progs_server')
        os.mkdir(self.user_progs_dir)

    def register_client(self, hardware, client_ip, client_port):
        """
        register a client with the server
        :hardware: hardware info dict
        :client_ip: addr of client
        :client_port: port number for client
        :returns: client uuid
        """
        client_uuid = util.hex_uuid()
        url = 'http://{}:{}'.format(client_ip, client_port)
        self.clients[client_uuid] = ClientState(client_uuid, hardware, url)
<<<<<<< HEAD
        LOG.info('Registered client: %s', self.clients[client_uuid])
        return client_uuid

    def get_all(self, endpoint, params=None, expect_json=True):
=======
        with remote_event.remote_events_access() as r:
            r.register_client(client_uuid)
        self.all_clients_hardware[client_uuid] = hardware
        LOG.info('Registered client: %s', self.clients[client_uuid])
        return client_uuid

    def unregister_client(self, client_uuid):
        """
        unregister a client with the server
        :client_uuid: client uuid
        :returns: client object
Ejemplo n.º 13
0
    client.create_client(args, tmpdir, hardware)
>>>>>>> ef9b13b186c1a356f50a36e78ad91a3ccff76392
    # automatically find available port
    client_port = util.get_free_port()
    # start client api server
    call = functools.partial(
        client.APP.run, debug=False, host='0.0.0.0', port=client_port)
    thread = threading.Thread(target=call)
    thread.daemon = True
    thread.start()
    # register with server
    with client.client_access() as c:
        c.register(client_port)
    # start client worker
    worker = client_worker.ClientWorker()
    LOG.info('starting client worker')
    try:
        worker.run()
    except queue.Empty:
        with client.client_access() as c:
            c.shutdown()
    return 0


def run_server(args, tmpdir):
    """
    entrypoint for server
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: 0 on success
    """
Ejemplo n.º 14
0
def client_item(client_id):
    """
    GET,DELETE /clients/<client_id>: query or remove a client

    GET responds with the client's properties as JSON. DELETE unregisters
    the client through the server state. Both respond with 404 when the
    client is not known.

    :client_id: client uuid
    :returns: flask response
    """
    if request.method == 'GET':
        with server.state_access() as state:
            client = state.clients.get(client_id)
        return (respond_json(client.properties) if client else
                respond_error(404))
    elif request.method == 'DELETE':
        # merge conflict resolved in favour of unregister_client, so the
        # removal goes through the state object instead of popping the
        # clients dict directly
        with server.state_access() as state:
            client = state.unregister_client(client_id)
        return Response("ok") if client is not None else respond_error(404)


@APP.route('/programs', methods=['GET', 'POST'])
def programs():
    """
    GET,POST /programs: register or list programs
    :returns: flask response
    """
    if request.method == 'POST':
        event_data = request.get_json()
        if not all(n in event_data for n in ('name', 'data', 'checksum')):
Ejemplo n.º 15
0
def handle_event_run_program(event):
    """
    handle 'run_program' event
    data must include 'dataset_enc', 'checksum' and 'global_params_enc'

    Runs in three phases, each sent to the clients with
    'send_remote_event' set: runtime init (one post per encoded data
    partition), repeated iterations until ``runtime.done`` is set, and a
    final cleanup.
    :event: event to handle
    :returns: dict with 'end_aggregate' and 'end_global_state'
    :raises: ValueError: if the program is not ready
    :raises: Exception: if error occurs or invalid request
    """
    runtime_id = util.hex_uuid()
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']
    # NOTE(review): these are request paths built with os.path.join, which
    # uses the platform separator - confirm this only runs on POSIX hosts
    init_path = os.path.join('/runtimes', prog_checksum, runtime_id)
    iterate_path = os.path.join(init_path, 'iterate')
    cleanup_path = os.path.join(init_path, 'cleanup')
    # one event is reused for all three phases; it is cleared after each wait
    wakeup_ev = threading.Event()

    def multi_callback_wakeup(event_props):
        wakeup_ev.set()

    def runtime_init_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime init'.format(client))

    with server.state_access() as s:
        program = s.registered_progs[prog_checksum]
    if not program.ready:
        raise ValueError('cannot run program, not ready')
    runtime = program.get_new_server_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.partition_data(dataset_enc)
    # phase 1: initialize the runtime on each client with its own partition
    runtime_init_remote_event_ids = []
    # note: the loop variable rebinds dataset_enc to the per-client partition
    for client_uuid, dataset_enc in runtime.dataset_partitions_encoded.items():
        data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'dataset_enc': dataset_enc,
            'global_params_enc': global_params_enc,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            c = s.clients[client_uuid]
            res = c.post(init_path, data, callback_func=runtime_init_callback)
            runtime_init_remote_event_ids.append(res['event_id'])
    # wake up once every collected init event has been handled
    with remote_event.remote_events_access() as r:
        r.register_multi_callback(
            runtime_init_remote_event_ids, multi_callback_wakeup)
    # NOTE(review): the result of wait() is ignored here and below, so a
    # timeout is treated exactly like completion
    wakeup_ev.wait(timeout=300)
    wakeup_ev.clear()
    LOG.info('Runtime initialized for user program: %s', program)
    # serializes runtime.aggregate calls made from the per-client callbacks
    aggregation_lock = threading.Lock()

    def run_iteration_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error running prog iteration'.format(client))
        with aggregation_lock:
            runtime.aggregate(event_props['result']['aggregation_result_enc'])
    runtime.reset_aggregation_result()
    iteration_count = 0
    # phase 2: iterate until the runtime reports it is done
    while True:
        post_data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'global_state_enc': runtime.global_state_encoded,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            s.post_all(
                iterate_path, post_data, callback_func=run_iteration_callback,
                multi_callback_func=multi_callback_wakeup)
        wakeup_ev.wait(timeout=600)
        wakeup_ev.clear()
        # fold the aggregated client results into the global state, then
        # start the next iteration from a fresh aggregation result
        runtime.update_global_state()
        runtime.reset_aggregation_result()
        LOG.debug('Completed iteration for user program: %s', program)
        iteration_count = iteration_count + 1
        if runtime.done:
            break

    # phase 3: ask every client to clean up its runtime
    LOG.info('Cleaning up...')
    def runtime_cleanup_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime clean up'.format(client))
    post_data = {
        'runtime_id': runtime_id,
        'checksum': prog_checksum,
        'send_remote_event': True,
    }
    with server.state_access() as s:
        s.post_all(
            cleanup_path, post_data, callback_func=runtime_cleanup_callback,
            multi_callback_func=multi_callback_wakeup)
    wakeup_ev.wait(timeout=60)
    wakeup_ev.clear()

    LOG.info('Finished running user program: %s %i', program, iteration_count)
    return {
        'end_aggregate': runtime.top_level_aggregate_encoded,
        'end_global_state': runtime.global_state_encoded,
    }