Example No. 1
0
def main():
    """
    Program entry point.

    Parses and normalizes command-line arguments, configures logging,
    then dispatches to the selected subcommand handler inside a managed
    temporary directory.
    :returns: the handler's return value, or -1 on bad arguments
    """
    handlers = {
        'client': run_client,
        'server': run_server,
        'cluster': run_cluster,
    }

    # build the parser, then parse and normalize the command line
    args = cli.normalize_args(cli.configure_parser().parse_args())
    if args is None:
        # normalization rejected the arguments
        return -1

    # verbose flag selects the log level
    level = logging.DEBUG if args.verbose else logging.INFO
    LOG.setLevel(level)
    LOG.debug('logging system init')
    LOG.debug('running with args: %s', args)

    # run the selected handler inside a managed temporary directory
    with util.TempDir(preserve=args.keep_tmpdir) as tmpdir:
        return handlers[args.subcmd](args, tmpdir)
Example No. 2
0
 def __enter__(self):
     """
     Create the temporary directory on context entry.
     :returns: path of the newly created temp directory
     """
     path = tempfile.mkdtemp(prefix=self.prefix)
     self.tempdir = path
     LOG.debug('Using tempdir: %s', path)
     return path
Example No. 3
0
 def unpack(self, item_keys):
     """
     Unpack user program files into the build directory.
     :item_keys: keys of the code items to write out
     """
     LOG.debug('Extracting user program code')
     for item_key in item_keys:
         # resolve the on-disk name, then the code body, for this item
         target_name = PROGRAM_SOURCE_FILE_NAMES[item_key]
         source_code = self.data['code'][item_key]
         target_path = os.path.join(self.build_dir, target_name)
         with open(target_path, 'w') as out_file:
             out_file.write(source_code)
Example No. 4
0
 def run(self):
     """
     Main loop: pull events off the server queue and process them.

     While shutdown is not scheduled, the get blocks waiting for work.
     Once shutdown is scheduled the queue is drained without blocking,
     so an empty queue ends the loop via queue.Empty.
     :returns: does not return, uses exception to end control
     :raises queue.Empty: when shutdown requested and queue empty
     """
     while True:
         # re-check the shutdown flag on every iteration
         should_block = not self.shutdown_scheduled
         ev = SERVER_QUEUE.get(block=should_block)
         LOG.debug('received event: %s', ev)
         self.handle_event(ev)
         LOG.debug('handled event: %s', ev)
         SERVER_QUEUE.task_done()
def get_reasonable_block_size(props, size_mult=32):
    """
    Pick a reasonable CUDA block size for the given GPU.

    Takes the midpoint between the maximum block size and the smallest
    size that can still fill an SM, rounded down to a multiple of
    ``size_mult``.
    :props: gpu properties dict
    :size_mult: block size multiple
    :returns: reasonable block size
    """
    upper = props['max_block_size']
    # smallest size that still occupies every resident block slot on an SM
    lower = props['max_sm_threads'] / props['max_sm_blocks']
    midpoint = (upper + lower) / 2
    block_size = int(midpoint / size_mult) * size_mult
    LOG.debug('Using CUDA block size: %s', block_size)
    return block_size
def scan_hardware(args, tmpdir):
    """
    Scan the system hardware.
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: dict with hardware info keyed by device class
    """
    # same evaluation order as before: CPUs first, then GPUs
    cpu_info = check_cpus()
    gpu_info = check_gpus(args, tmpdir)
    hardware = {'CPU': cpu_info, 'GPU': gpu_info}
    LOG.debug('hardware scan found: %s', hardware)
    return hardware
Example No. 7
0
 def copy_build_files(self, build_files):
     """
     Copy build files from data/build_files into the build dir.
     :build_files: names of files to copy
     """
     LOG.debug('Copying additional build files')
     for name in build_files:
         # locate the file in the packaged data directory
         resource = os.path.join(PROGRAM_DATA_DIRNAME, 'build_files', name)
         contents = pkgutil.get_data('lizard', resource)
         destination = os.path.join(self.build_dir, name)
         with open(destination, 'wb') as out_file:
             out_file.write(contents)
Example No. 8
0
    def partition_data(self, data):
        """
        Load the dataset and partition it among connected clients.

        Each client receives at most ``split_size`` items, further capped
        by a rough estimate of available GPU memory. Partitions are stored
        in ``self.client_datasets`` keyed by client uuid, each encoded as
        ``[item_count, items]``.
        :data: dataset, consumed via ``self.py_mod.split_data``
        """
        client_uuids = list(self.hardware.keys())
        client_count = len(client_uuids)
        LOG.debug("data size %i", sys.getsizeof(data))
        # FIXME this is a really rough estimate as the final calculation is done
        # after casting to double
        # NOTE: sys.getsizeof is shallow; it does not follow references into
        # contained objects, so these sizes underestimate real memory use

        data_generator = self.py_mod.split_data(data)
        LOG.info(self.global_params)
        # every client gets at most ceil-style share of the total item count
        split_size = self.global_params[0] // client_count + 1
        LOG.debug("split size %i", split_size)
        for client_uuid in client_uuids:
            LOG.info("Splitting data")
            # FIXME use hardware scan to discover GPU mem size
            # currently rounded slightly down from 8G (8589934592 bytes)
            # to avoid overflowing in loop
            gpu_mem_remaining = 8500000000
            split_remaining = split_size
            data_count = 0

            LOG.info("global_params %s", self.global_params)
            dataset = []
            # reserve room for the parameters themselves
            gpu_mem_remaining = (gpu_mem_remaining -
                                 sys.getsizeof(self.global_params))
            try:
                # pull items until the client's share or its memory budget
                # is exhausted, or the generator runs dry
                while split_remaining > 0 and gpu_mem_remaining > 0:
                    next_split = next(data_generator)
                    split_remaining = split_remaining - 1
                    gpu_mem_remaining = (gpu_mem_remaining -
                                         sys.getsizeof(next_split))
                    dataset.append(next_split)
                    data_count = data_count + 1
            except StopIteration:
                # no more data; hand this client whatever it has so far
                pass

            self.client_datasets[client_uuid] = [data_count, dataset]
        self._initialize_global_state()
Example No. 9
0
    def build(
            self, cuda_bin=None, include_path=None, unpack=True,
            set_compute_level=True):
        """
        set up user program resources and build shared obj

        Unpacks program sources into the build dir, copies build files,
        runs make to produce the CUDA shared object, and (when a C
        extension is used) builds the Python wrapper and makes it
        importable. Sets ``self.ready`` on success.
        :cuda_bin: path to cuda tools bin
        :include_path: path to cuda include dir
        :unpack: if true, unpack program json
        :set_compute_level: if true, specify appropriate compute level
        :raises ValueError: if the build dir has not been set up
        """
        # the build dir must already exist before anything is written into it
        if not self.build_dir or not os.path.isdir(self.build_dir):
            raise ValueError("Build dir not set up")
        if unpack:
            # core program sources; 'cpp' only when a C extension is used
            files = ['cuda', 'python', 'header']
            if self.use_c_extention:
                files.append('cpp')
            self.unpack(files)
        build_files = ['Makefile']
        if self.use_c_extention:
            build_files.append('setup.py')
        self.copy_build_files(build_files)
        # assemble the make invocation, overriding variables as needed
        make_cmd = ['make', '-C', self.build_dir]
        if cuda_bin is not None:
            # point make at the requested nvcc binary
            nvcc_path = os.path.join(cuda_bin, 'nvcc')
            make_cmd.append('NVCC={}'.format(nvcc_path))
        if include_path is not None:
            make_cmd.append('CUDA_L64=-L{}'.format(include_path))
        if set_compute_level:
            flag_value = '-arch={}'.format(self.compute_level)
            make_cmd.append('COMPUTE_LEVEL_FLAG={}'.format(flag_value))
            LOG.debug('Using compute level: %s', flag_value)
        else:
            LOG.warning('Using default compute level, not optimized')
        LOG.debug('Building CUDA shared object')
        util.subp(make_cmd)

        if self.use_c_extention:
            LOG.debug('Building Python wrapper module')

            # XXX
            # FIXME create hardcoded tmp dir used by dynamic linker
            shared_dll = 'user_program_cuda.so'
            tmp_dir = '/tmp/lizard-slayer/'
            pathlib.Path(tmp_dir).mkdir(exist_ok=True)
            # clear stale artifacts from any previous build out of tmp_dir
            for the_file in os.listdir(tmp_dir):
                file_path = os.path.join(tmp_dir, the_file)
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)

            setup_cmd = ['python3', 'setup.py', 'build_ext', '-b', tmp_dir]

            util.subp(setup_cmd, cwd=self.build_dir)
            # copy over the shared library to be found by the linker
            shutil.copyfile(os.path.join(self.build_dir, shared_dll),
                            os.path.join(tmp_dir, shared_dll))
            # FIXME remove path
            # make the freshly built extension importable from this process
            sys.path.append(tmp_dir)
            sys.path.append(self.build_dir)
            self.ready = True
        else:
            LOG.debug('No python c extention for user program')
            self.ready = True
Example No. 10
0
def handle_event_run_program(event):
    """
    handle 'run_program' event

    Drives a full distributed program run: initializes a runtime on every
    client, iterates until the runtime reports done (aggregating client
    results each round), then cleans up the remote runtimes.
    :event: event to handle
    :returns: program result
    :raises: Exception: if error occurs or invalid request
    """
    runtime_id = util.hex_uuid()
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']
    # REST paths for the three phases of the remote runtime lifecycle
    init_path = os.path.join('/runtimes', prog_checksum, runtime_id)
    iterate_path = os.path.join(init_path, 'iterate')
    cleanup_path = os.path.join(init_path, 'cleanup')
    wakeup_ev = threading.Event()

    def multi_callback_wakeup(event_props):
        # fired once all outstanding remote events have completed
        wakeup_ev.set()

    def runtime_init_callback(client, event_props):
        # per-client init result check
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime init'.format(client))

    with server.state_access() as s:
        program = s.registered_progs[prog_checksum]
    if not program.ready:
        raise ValueError('cannot run program, not ready')
    runtime = program.get_new_server_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.partition_data(dataset_enc)
    # send each client its partition and collect the remote event ids
    runtime_init_remote_event_ids = []
    # NOTE(review): the loop variable shadows the outer dataset_enc from
    # event.data; harmless here since the outer value is not reused after
    # partition_data, but worth renaming
    for client_uuid, dataset_enc in runtime.dataset_partitions_encoded.items():
        data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'dataset_enc': dataset_enc,
            'global_params_enc': global_params_enc,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            c = s.clients[client_uuid]
            res = c.post(init_path, data, callback_func=runtime_init_callback)
            runtime_init_remote_event_ids.append(res['event_id'])
    with remote_event.remote_events_access() as r:
        r.register_multi_callback(
            runtime_init_remote_event_ids, multi_callback_wakeup)
    # NOTE(review): wait() return value is ignored, so a timeout proceeds
    # as if all clients initialized — confirm this is intended
    wakeup_ev.wait(timeout=300)
    wakeup_ev.clear()
    LOG.info('Runtime initialized for user program: %s', program)
    # serializes concurrent aggregate() calls from client callbacks
    aggregation_lock = threading.Lock()

    def run_iteration_callback(client, event_props):
        # fold each client's per-iteration result into the runtime aggregate
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error running prog iteration'.format(client))
        with aggregation_lock:
            runtime.aggregate(event_props['result']['aggregation_result_enc'])
    runtime.reset_aggregation_result()
    iteration_count = 0
    # iterate: broadcast global state, wait for all clients, update state
    while True:
        post_data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'global_state_enc': runtime.global_state_encoded,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            s.post_all(
                iterate_path, post_data, callback_func=run_iteration_callback,
                multi_callback_func=multi_callback_wakeup)
        wakeup_ev.wait(timeout=600)
        wakeup_ev.clear()
        runtime.update_global_state()
        runtime.reset_aggregation_result()
        LOG.debug('Completed iteration for user program: %s', program)
        iteration_count = iteration_count + 1
        if runtime.done:
            break

    LOG.info('Cleaning up...')
    def runtime_cleanup_callback(client, event_props):
        # per-client cleanup result check
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime clean up'.format(client))
    post_data = {
        'runtime_id': runtime_id,
        'checksum': prog_checksum,
        'send_remote_event': True,
    }
    with server.state_access() as s:
        s.post_all(
            cleanup_path, post_data, callback_func=runtime_cleanup_callback,
            multi_callback_func=multi_callback_wakeup)
    wakeup_ev.wait(timeout=60)
    wakeup_ev.clear()

    LOG.info('Finished running user program: %s %i', program, iteration_count)
    return {
        'end_aggregate': runtime.top_level_aggregate_encoded,
        'end_global_state': runtime.global_state_encoded,
    }