Example #1
def test_ols(nrows, ncols, n_parts, fit_intercept, normalize, datatype,
             delayed, cluster):

    client = Client(cluster)

    try:

        def imp():
            import cuml.comm.serialize  # NOQA

        client.run(imp)

        from cuml.dask.linear_model import LinearRegression as cumlOLS_dask

        n_info = 5
        nrows = int(nrows)
        ncols = int(ncols)
        X, y = make_regression_dataset(datatype, nrows, ncols, n_info)

        X_df, y_df = _prep_training_data(client, X, y, n_parts)

        lr = cumlOLS_dask(fit_intercept=fit_intercept, normalize=normalize)

        lr.fit(X_df, y_df)

        ret = lr.predict(X_df, delayed=delayed)

        error_cuml = mean_squared_error(y, ret.compute().to_pandas().values)

        assert (error_cuml < 1e-6)

    finally:
        client.close()
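# Note: _prep_training_data is not shown in this example. A minimal sketch, assuming the
# helper moves the NumPy arrays onto the GPU and partitions them across the Dask workers:
import cudf
import dask_cudf


def _prep_training_data(client, X, y, n_parts):
    # Build single-GPU cudf objects, then split them into n_parts partitions
    # that the dask-based estimator can distribute across the workers.
    X_df = dask_cudf.from_cudf(cudf.DataFrame(X), npartitions=n_parts)
    y_df = dask_cudf.from_cudf(cudf.Series(y), npartitions=n_parts)
    return X_df, y_df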
Example #2
def attach_to_cluster(cli_args):
    """Attaches to an existing cluster if available.
    By default, tries to attach to a cluster running on localhost:8786 (dask's default).

    This is currently hardcoded to assume the dashboard is running on port 8787.
    """
    host = cli_args.get("cluster_host")
    port = cli_args.get("cluster_port", "8786")

    if host is not None:
        try:
            content = requests.get(
                "http://" + host +
                ":8787/info/main/workers.html").content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    else:
        raise ValueError("Must pass a cluster address to the host argument.")

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)
    return client
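# Hypothetical usage (not part of the source): the dict keys mirror the ones the
# function reads above, assuming the scheduler listens on port 8786 and its dashboard
# on port 8787 of the same host.
cli_args = {"cluster_host": "127.0.0.1", "cluster_port": "8786"}
client = attach_to_cluster(cli_args)
print("Workers:", list(client.scheduler_info()["workers"]))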
Example #3
def main():
    parser = argparse.ArgumentParser(description="Dask launcher script")
    parser.add_argument("-bokehPort",
                        default=8888,
                        type=int,
                        help="Port for bokeh server")
    parser.add_argument("-nWorkers",
                        default=8,
                        type=int,
                        help="Number of dask workers to launch")
    parser.add_argument("-schedIp",
                        default="localhost",
                        type=str,
                        help="Scheduler IP address needed for dask-worker's")
    parser.add_argument("-schedPort",
                        default=8787,
                        type=int,
                        help="Port for the dask-scheduler")
    parser.add_argument("-module",
                        default=None,
                        type=str,
                        help="Name of the dask-aware module to be executed")
    parser.add_argument("-mpi",
                        default=False,
                        action='store_true',
                        help="Whether this is an MPI run or not")
    args = parser.parse_args()
    if not args.module:
        raise Exception("'-module' is mandatory!")
    de = DaskEnv(args)
    module = __import__(args.module)
    ipAddr = "%s:%d" % (args.schedIp, args.schedPort)
    client = Client(ipAddr)
    client.run(module.run, args.nWorkers, de.mpiServerUri)
def reload_modules_on_workers(url, modulelist=None):
    """Run reload(module) on the items in the modulelist"""
    client = Client(url)
    for mod in modulelist:
        print("reloading %s" % mod)
        client.run(importlib.reload, mod)
        client.run_on_scheduler(importlib.reload, mod)
def main():
    """
    This function represents the main function of your Dask based system.

    In this example it creates a simple set of workers and launches some
    dummy tasks that just create random log messages.
    """
    # Configure the log listener and launch it in a separate process
    log_listener = LogListener(configure_logging)
    log_listener.start()

    # Launch some Dask workers
    client = Client(threads_per_worker=1, n_workers=10)

    # Run the log configuration code on each worker in the Dask cluster
    client.run(worker_logging_configurer, log_listener.queue)

    # Create some dummy tasks to run on the workers
    # This is where your core computation would be in a real system.
    tasks = [worker_task(i) for i in range(10)]

    # Launch the work on the cluster
    compute(tasks)

    # This is the end of the core computation and now comes any cleanup code.

    # Stop the log listener and clean up
    log_listener.stop()
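# The logging helpers used above are not shown. A minimal sketch, assuming the standard
# queue-based pattern from the Python logging cookbook: each worker process forwards its
# log records to the queue owned by the LogListener process.
import logging
import logging.handlers


def worker_logging_configurer(queue):
    handler = logging.handlers.QueueHandler(queue)  # ship records to the listener
    root = logging.getLogger()
    root.addHandler(handler)
    root.setLevel(logging.INFO)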
def dask_cluster(request):
    cluster = LocalCluster(n_workers=2,
                           threads_per_worker=1,
                           silence_logs=False)

    client = Client(cluster)

    # cluster setup

    def set_blosc_threads():
        from numcodecs import blosc

        blosc.use_threads = False

    log_level_name = request.config.getoption(
        "--redirect-dask-worker-logs-to-stdout")
    level = logging.getLevelName(log_level_name)

    def redirect_logs():
        import logging

        logger = logging.getLogger("pangeo_forge")
        handler = logging.StreamHandler()
        handler.setLevel(level)
        logger.setLevel(level)
        logger.addHandler(handler)

    client.run(set_blosc_threads)
    client.run(redirect_logs)
    client.close()
    del client

    yield cluster

    cluster.close()
def setup_dask_cluster(
        num_workers: int = 0,
        threads_per_worker: int = 1) -> Tuple[Client, LocalCluster]:
    cluster = LocalCluster(n_workers=(num_workers or cpu_count()),
                           threads_per_worker=threads_per_worker)
    client = Client(cluster)
    client.run(init_worker)
    return client, cluster
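# init_worker is not defined in this snippet. A hypothetical version might simply pin
# per-worker thread counts before any numerical libraries spin up their thread pools:
import os


def init_worker():
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["OPENBLAS_NUM_THREADS"] = "1"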
Example #8
def main(args):
    # Set up workers on the local machine
    cluster = LocalCUDACluster(protocol=args.protocol,
                               n_workers=args.n_workers,
                               CUDA_VISIBLE_DEVICES=args.devs)
    client = Client(cluster)

    if args.no_pool_allocator:
        client.run(cudf.set_allocator, "default", pool=False)
    else:
        client.run(cudf.set_allocator, "default", pool=True)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(args, write_profile=None))
    took_list.append(run(
        args, write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {(cluster.scheduler.workers[w1].name,
                   cluster.scheduler.workers[w2].name): [
                       "%s/s" % format_bytes(x)
                       for x in numpy.quantile(v, [0.25, 0.50, 0.75])
                   ]
                  for (w1, w2), v in bandwidths.items()}
    total_nbytes = {(
        cluster.scheduler.workers[w1].name,
        cluster.scheduler.workers[w2].name,
    ): format_bytes(sum(nb))
                    for (w1, w2), nb in total_nbytes.items()}

    print("Merge benchmark")
    print("--------------------------")
    print(f"Chunk-size  | {args.chunk_size}")
    print(f"Frac-match  | {args.frac_match}")
    print(f"Ignore-size | {format_bytes(args.ignore_size)}")
    print(f"Protocol    | {args.protocol}")
    print(f"Device(s)   | {args.devs}")
    print("==========================")
    for took in took_list:
        print(f"Total time  | {format_time(took)}")
    print("==========================")
    print("(w1,w2)     | 25% 50% 75% (total nbytes)")
    print("--------------------------")
    for (d1, d2), bw in sorted(bandwidths.items()):
        print("(%02d,%02d)     | %s %s %s (%s)" %
              (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
Example #9
def run(pl_conf, logging_init_fn=None):
    start = timer()

    # Initialize local dask cluster
    logger.info('Initializing pipeline tasks for %s workers', pl_conf.n_workers)
    logger.debug('Pipeline configuration: %s', pl_conf)
    cluster = LocalCluster(
        n_workers=pl_conf.n_workers, threads_per_worker=1,
        processes=True, memory_limit=pl_conf.memory_limit
    )
    client = Client(cluster)

    # Split total region + tile indexes to process into separate lists for each worker 
    # (by indexes of those index combinations)
    tiles = pl_conf.region_tiles
    idx_batches = np.array_split(np.arange(len(tiles)), pl_conf.n_workers)

    # Assign gpus to tasks in round-robin fashion
    def get_gpu(i):
        if pl_conf.gpus is None:
            return None
        return pl_conf.gpus[i % len(pl_conf.gpus)]

    # Generate a single task configuration for each worker
    tasks = [
        pl_conf.get_task_config(region_indexes=tiles[idx_batch, 0], tile_indexes=tiles[idx_batch, 1], gpu=get_gpu(i))
        for i, idx_batch in enumerate(idx_batches)
    ]

    logger.info('Starting pipeline for %s tasks', len(tasks))
    logger.debug('Task definitions:\n\t%s', '\n\t'.join([str(t) for t in tasks]))
    try:
        # Passing logging initialization operation, if given, to workers now
        # running in separate processes
        if logging_init_fn:
            client.run(logging_init_fn)

        # Disable the "auto_restart" feature of dask workers which is of no use in this context
        for worker in cluster.workers:
            worker.auto_restart = False

        # Pass tasks to each worker to execute in parallel
        res = client.map(run_pipeline_task, tasks)
        res = [r.result() for r in res]
        if len(res) != len(tasks):
            raise ValueError('Parallel execution returned {} results but {} were expected'.format(len(res), len(tasks)))
        stop = timer()
        if logger.isEnabledFor(logging.DEBUG):
            from scipy.stats import describe
            times = np.concatenate([np.array(t)[2] for t in res], 0)
            logger.debug('Per-tile execution time summary (all in seconds): %s', describe(times))
        logger.info('Pipeline execution completed in %s seconds', stop - start)
    finally:
        client.close()
        cluster.close()
Example #10
def attach_to_cluster(cli_args):
    """Attaches to an existing cluster if available.
    By default, tries to attach to a cluster running on localhost:8786 (dask's default).

    This is currently hardcoded to assume the dashboard is running on port 8787.
    """
    host = cli_args.get("cluster_host")
    port = cli_args.get("cluster_port", "8786")

    if host is not None:
        try:
            content = requests.get(
                "http://" + host + ":8787/info/main/workers.html"
            ).content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    else:
        raise ValueError("Must pass a cluster address to the host argument.")

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    cli_args.update(ucx_config)

    # Save worker information
    gpu_sizes = ["16GB", "32GB", "40GB"]
    worker_counts = worker_count_info(client, gpu_sizes=gpu_sizes)
    for size in gpu_sizes:
        key = size + "_workers"
        if cli_args.get(key) is not None and cli_args.get(key) != worker_counts[size]:
            print(
                f"Expected {cli_args.get(key)} {size} workers in your cluster, but got {worker_counts[size]}. It can take a moment for all workers to join the cluster. You may also have misconfigred hosts."
            )
            sys.exit(-1)

    cli_args["16GB_workers"] = worker_counts["16GB"]
    cli_args["32GB_workers"] = worker_counts["32GB"]
    cli_args["40GB_workers"] = worker_counts["40GB"]

    return client
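# worker_count_info is not shown in this example. A hypothetical sketch, assuming it
# buckets workers by the total memory of their first visible GPU (queried via pynvml):
def worker_count_info(client, gpu_sizes):
    def _gpu_mem_gb():
        import pynvml

        pynvml.nvmlInit()
        total = pynvml.nvmlDeviceGetMemoryInfo(
            pynvml.nvmlDeviceGetHandleByIndex(0)).total
        pynvml.nvmlShutdown()
        return total / 1e9

    counts = {size: 0 for size in gpu_sizes}
    for mem_gb in client.run(_gpu_mem_gb).values():
        for size in gpu_sizes:
            # Crude bucketing: count a worker toward an advertised size if its
            # reported total memory is within ~3 GB of it.
            if abs(mem_gb - float(size.rstrip("GB"))) <= 3.0:
                counts[size] += 1
                break
    return counts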
def start_local_CUDA_cluster(devices, pool):
    if len(devices) > 1:
        cluster = LocalCUDACluster(
            n_workers=len(devices),
            CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
        )
        client = Client(cluster)
        if pool:
            client.run(_pool)
    elif pool:
        _pool()
    return client
Example #12
def main():
    # Setup logging on the main process:
    _start_logging()

    # Start three worker processes on the local machine:
    client = Client(n_workers=3, threads_per_worker=1)

    # Setup Eliot logging on each worker process:
    client.run(_start_logging)

    # Run the Dask computation in the worker processes:
    result = main_computation()
    print("Result:", result)
Example #14
def test_create_rapids_cluster_sync():
    skip_without_credentials()
    cluster = GCPCluster(
        source_image="projects/nv-ai-infra/global/images/packer-1607527229",
        network="dask-gcp-network-test",
        zone="us-east1-c",
        machine_type="n1-standard-1",
        filesystem_size=50,
        ngpus=2,
        gpu_type="nvidia-tesla-t4",
        docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.8",
        worker_class="dask_cuda.CUDAWorker",
        worker_options={"rmm_pool_size": "15GB"},
        asynchronous=False,
        bootstrap=False,
    )

    cluster.scale(1)

    client = Client(cluster)  # noqa
    client.wait_for_workers(2)

    def gpu_mem():
        from pynvml.smi import nvidia_smi

        nvsmi = nvidia_smi.getInstance()
        return nvsmi.DeviceQuery("memory.free, memory.total")

    results = client.run(gpu_mem)
    for w, res in results.items():
        assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
        print(res)
    cluster.close()
Example #15
def install_libraries_on_workers(url, runlist=None):
    """Install libraries if necessary on workers etc.
    e.g. if already on server...
    install_libraries_on_workers('127.0.0.1:8786')
    """

    client = Client(url)

    if runlist is None:
        runlist = [
            'sudo apt-get -y install build-essential', 'pip install -U pip',
            'sudo apt install libgl1-mesa-glx -y', 'conda update scipy -y',
            'pip install git+https://github.com/sods/paramz.git',
            'pip install git+https://github.com/SheffieldML/GPy.git',
            'pip install git+https://github.com/lionfish0/dp4gp.git',
            'conda install dask-searchcv -c conda-forge -y',
            'pip install git+https://github.com/lionfish0/dask_dp4gp.git',
            'pip install numpy', 'conda remove argcomplete -y',
            'pip install git+https://github.com/lionfish0/dialysis_analysis.git --upgrade'
        ]  #, 'conda install python=3.6 -y']

    for item in runlist:
        print("Installing '%s' on workers..." % item)
        res = client.run(os.system, item)
        print(res)
        print("Installing '%s' on scheduler..." % item)
        res = client.run_on_scheduler(os.system, item)
        print(res)
Example #16
def send_package_to_dask_workers(directory, scheduler_ip=None, client=None):
    """
	Send a package to all workers

	One of client and scheduler_ip should be given.

	Parameters
	----------
	directory : str
	scheduler_ip : str
		ignored if client is given
	client : dask.distributed.Client

	"""
    from .tar import directory_to_targz_string
    if client is None:
        if scheduler_ip is None:
            raise ValueError("must give scheduler or client")
        from dask.distributed import Client
        if isinstance(scheduler_ip, Client):
            client = scheduler_ip
        elif isinstance(scheduler_ip, str):
            client = Client(f"{scheduler_ip}:8786")
        else:
            raise TypeError("bad scheduler")
    package_name = os.path.basename(directory.rstrip("/").rstrip("\\"))
    s = directory_to_targz_string(directory)
    return client.run(receive_tar_package, s, package_name)
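# The worker-side receive_tar_package is not shown. A minimal sketch, assuming
# directory_to_targz_string returns the raw bytes of a .tar.gz archive of the package:
def receive_tar_package(targz_bytes, package_name):
    import io
    import tarfile

    # Unpack the archive into the worker's current working directory so the
    # package directory becomes importable from there.
    with tarfile.open(fileobj=io.BytesIO(targz_bytes), mode="r:gz") as tar:
        tar.extractall()
    return package_name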
Example #17
def set_cluster_client(n_gpus=-1, device_spill_frac=0.8):
    # TODO: Find a proper solution. If the user calls this function a second time, the
    # cluster cannot be recreated correctly; a new cluster can only be created after a
    # kernel restart.
    '''
    device_spill_frac: Spill GPU-worker memory to host at this limit. Reduce this value
    if spilling fails, to prevent device memory errors.
    '''
    if os.path.isdir("dask-worker-space"):
        shutil.rmtree('dask-worker-space', ignore_errors=True)
    # Deploy a Single-Machine Multi-GPU Cluster
    if n_gpus == -1:
        nvidia_smi.nvmlInit()
        n_gpus_avail = nvidia_smi.nvmlDeviceGetCount()
        print('\n n_gpus_avail: {}'.format(n_gpus_avail))
        n_gpus = n_gpus_avail
    # Select devices to place workers
    visible_devices = [i for i in list(range(n_gpus))]
    visible_devices = str(visible_devices)[1:-1]
    #print('visible_devices: {}'.format(visible_devices))

    # TODO: how to reinitialize the cluster
    cluster = LocalCUDACluster(
        protocol="tcp",  # "tcp" or "ucx"
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=device_spill_frac * device_mem_size(kind="total"),
    )
    try:
        # Create the distributed client
        client = Client(cluster)
        display(client)
        print('\n Dashboard avail: http://localhost:8888/proxy/8787/status')

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                pool_allocator=True,
                initial_pool_size=None,  # Use default size
            )

        client.run(_rmm_pool)
        return client
    except MemoryError:
        print('\n The client is already initialized')
Example #18
def test_listener(cluster):

    c = Client(cluster)

    multiple_workers = len(c.scheduler_info()["workers"]) > 1

    # Test only runs when multiple workers are present
    if multiple_workers:

        def build_ucx():
            # Create listener and cache on worker
            get_worker()._callback_invoked = False

            def mock_callback(ep):
                get_worker()._callback_invoked = True

            ucx = UCX.get(mock_callback)

            get_worker()._ucx = ucx
            return get_worker().address, ucx.listener_port()

        ports = c.run(build_ucx)

        def get_endpoints(addr_ports):
            # Create endpoints to all other workers
            ucx = get_worker()._ucx

            for address, port in addr_ports:
                if address != get_worker().address:
                    host, p = parse_host_port(address)
                    ucx.get_endpoint(host, port)

        c.run(get_endpoints, [ap for ap in ports.values()])

        def callback_invoked():
            # Return True if listener callback was invoked
            return get_worker()._callback_invoked

        invoked = c.run(callback_invoked)

        assert all(invoked)
Example #19
def initialize_cluster(use_gpu=True, n_cpu=None, n_gpu=-1):
    enable_tcp_over_ucx = True
    enable_nvlink = True
    enable_infiniband = True

    logger.info('Starting dask cluster...')
    if use_gpu:
        initialize.initialize(create_cuda_context=True,
                              enable_tcp_over_ucx=enable_tcp_over_ucx,
                              enable_nvlink=enable_nvlink,
                              enable_infiniband=enable_infiniband)
        if n_gpu == -1:
            n_gpu = get_n_gpus()

        device_list = cuda_visible_devices(1, range(n_gpu)).split(',')
        CUDA_VISIBLE_DEVICES = []
        for device in device_list:
            try:
                CUDA_VISIBLE_DEVICES.append(int(device))
            except ValueError as vex:
                logger.warning(vex)

        logger.info('Using GPUs {} ...'.format(CUDA_VISIBLE_DEVICES))

        cluster = LocalCUDACluster(protocol="ucx",
                                   dashboard_address=':8787',
                                   CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
                                   enable_tcp_over_ucx=enable_tcp_over_ucx,
                                   enable_nvlink=enable_nvlink,
                                   enable_infiniband=enable_infiniband)
    else:
        logger.info('Using {} CPUs ...'.format(n_cpu))
        cluster = LocalCluster(dashboard_address=':8787',
                               n_workers=n_cpu,
                               threads_per_worker=4)

    client = Client(cluster)
    client.run(cupy.cuda.set_allocator)
    return client
def main():
    #print('XGBOOST_BUILD_DOC is ' + os.environ['XGBOOST_BUILD_DOC'])
    parser = argparse.ArgumentParser("rapidssample")
    parser.add_argument("--data_dir", type=str, help="location of data")
    parser.add_argument("--num_gpu", type=int, help="Number of GPUs to use", default=1)
    parser.add_argument("--part_count", type=int, help="Number of data files to train against", default=2)
    parser.add_argument("--end_year", type=int, help="Year to end the data load", default=2000)
    parser.add_argument("--cpu_predictor", type=str, help="Flag to use CPU for prediction", default='False')
    parser.add_argument('-f', type=str, default='') # added for notebook execution scenarios
    args = parser.parse_args()
    data_dir = args.data_dir
    num_gpu = args.num_gpu
    part_count = args.part_count
    end_year = args.end_year
    cpu_predictor = args.cpu_predictor.lower() in ('yes', 'true', 't', 'y', '1')

    if cpu_predictor:
        print('Training with CPUs requires num_gpu = 1')
        num_gpu = 1

    print('data_dir = {0}'.format(data_dir))
    print('num_gpu = {0}'.format(num_gpu))
    print('part_count = {0}'.format(part_count))
    #part_count = part_count + 1 # adding one because the usage below is not inclusive
    print('end_year = {0}'.format(end_year))
    print('cpu_predictor = {0}'.format(cpu_predictor))
    
    import subprocess

    cmd = "hostname --all-ip-addresses"
    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    IPADDR = str(output.decode()).split()[0]
    
    cluster = LocalCUDACluster(ip=IPADDR,n_workers=num_gpu)
    client = Client(cluster)
    client
    print(client.ncores())

# to download data for this notebook, visit https://rapidsai.github.io/demos/datasets/mortgage-data and update the following paths accordingly
    acq_data_path = "{0}/acq".format(data_dir) #"/rapids/data/mortgage/acq"
    perf_data_path = "{0}/perf".format(data_dir) #"/rapids/data/mortgage/perf"
    col_names_path = "{0}/names.csv".format(data_dir) # "/rapids/data/mortgage/names.csv"
    start_year = 2000
#end_year = 2000 # end_year is inclusive -- converted to parameter
#part_count = 2 # the number of data files to train against -- converted to parameter

    client.run(initialize_rmm_pool)
    client
    print(client.ncores())
# NOTE: The ETL calculates additional features which are then dropped before creating the XGBoost DMatrix.
# This can be optimized to avoid calculating the dropped features.
    print("Reading ...")
    t1 = datetime.datetime.now()
    gpu_dfs = []
    gpu_time = 0
    quarter = 1
    year = start_year
    count = 0
    while year <= end_year:
        for file in glob(os.path.join(perf_data_path + "/Performance_" + str(year) + "Q" + str(quarter) + "*")):
            if count < part_count:
                gpu_dfs.append(process_quarter_gpu(client, col_names_path, acq_data_path, year=year, quarter=quarter, perf_file=file))
                count += 1
                print('file: {0}'.format(file))
                print('count: {0}'.format(count))
        quarter += 1
        if quarter == 5:
            year += 1
            quarter = 1
            
    wait(gpu_dfs)
    t2 = datetime.datetime.now()
    print("Reading time ...")
    print(t2-t1)
    print('len(gpu_dfs) is {0}'.format(len(gpu_dfs)))
    
    client.run(cudf._gdf.rmm_finalize)
    client.run(initialize_rmm_no_pool)
    client
    print(client.ncores())
    dxgb_gpu_params = {
        'nround':            100,
        'max_depth':         8,
        'max_leaves':        2**8,
        'alpha':             0.9,
        'eta':               0.1,
        'gamma':             0.1,
        'learning_rate':     0.1,
        'subsample':         1,
        'reg_lambda':        1,
        'scale_pos_weight':  2,
        'min_child_weight':  30,
        'tree_method':       'gpu_hist',
        'n_gpus':            1, 
        'distributed_dask':  True,
        'loss':              'ls',
        'objective':         'gpu:reg:linear',
        'max_features':      'auto',
        'criterion':         'friedman_mse',
        'grow_policy':       'lossguide',
        'verbose':           True
    }
      
    if cpu_predictor:
        print('Training using CPUs')
        dxgb_gpu_params['predictor'] = 'cpu_predictor'
        dxgb_gpu_params['tree_method'] = 'hist'
        dxgb_gpu_params['objective'] = 'reg:linear'
        
    else:
        print('Training using GPUs')
    
    print('Training parameters are {0}'.format(dxgb_gpu_params))
    
    gpu_dfs = [delayed(DataFrame.from_arrow)(gpu_df) for gpu_df in gpu_dfs[:part_count]]
    gpu_dfs = [gpu_df for gpu_df in gpu_dfs]
    wait(gpu_dfs)
    
    tmp_map = [(gpu_df, list(client.who_has(gpu_df).values())[0]) for gpu_df in gpu_dfs]
    new_map = {}
    for key, value in tmp_map:
        if value not in new_map:
            new_map[value] = [key]
        else:
            new_map[value].append(key)
    
    del(tmp_map)
    gpu_dfs = []
    for list_delayed in new_map.values():
        gpu_dfs.append(delayed(cudf.concat)(list_delayed))
    
    del(new_map)
    gpu_dfs = [(gpu_df[['delinquency_12']], gpu_df[delayed(list)(gpu_df.columns.difference(['delinquency_12']))]) for gpu_df in gpu_dfs]
    gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs]
    
    gpu_dfs = [dask.delayed(xgb.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs]
    gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs]
    gc.collect()
    wait(gpu_dfs)
    
    labels = None
    t1 = datetime.datetime.now()
    bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround'])
    t2 = datetime.datetime.now()
    print("Training time ...")
    print(t2-t1)
    print('str(bst) is {0}'.format(str(bst)))
    print('Exiting script')
# Set up local cluster
client = Client('tcp://127.0.0.1:61980')
client

# Upload the utils.py file, so that the Dask cluster has access to relevant auxiliary functions
client.upload_file(f'{project_path}NeuralNetwork.py')
client.upload_file(f'{project_path}utils.py')
client.upload_file(f'{project_path}search_explore.py')
client.upload_file(f'{project_path}data_processing.py')

# **Problem:** Somehow, everything works fine if I initialize the Dask client without specifying the TCP address. But if I specify the one obtained from the Jupyter Lab Dask extension, it returns "ModuleNotFoundError: No module named 'torch'"! Perhaps the Jupyter Lab Dask extension is associated with a different Python environment.
#
# **Solution:** Jupyter Lab must be started from within the desired virtual environment's shell.

client.run(os.getcwd)
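# A quick diagnostic (assumption, not part of the original notebook): compare the local
# interpreter with the one each worker runs, to confirm whether the torch import error
# comes from a mismatched Python environment.
import sys

print('Local interpreter:', sys.executable)
print('Worker interpreters:', client.run(lambda: sys.executable))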

# ## Loading data

all_files = glob.glob(f'{data_path}/*.txt')

# +
files_list = []

for filename in all_files:
    df = dd.read_csv(filename, header=0, sep='\t')
    files_list.append(df)

uoreg_df = dd.concat(files_list)
# -
Example #22
    def initialize_rmm_pool():
        from librmm_cffi import librmm_config as rmm_cfg

        rmm_cfg.use_pool_allocator = True
        #rmm_cfg.initial_pool_size = 2<<30 # set to 2GiB. Default is 1/2 total GPU memory
        import cudf
        return cudf._gdf.rmm_initialize()

    def initialize_rmm_no_pool():
        from librmm_cffi import librmm_config as rmm_cfg

        rmm_cfg.use_pool_allocator = False
        import cudf
        return cudf._gdf.rmm_initialize()

    # In[ ]:

    client.run(initialize_rmm_pool)

    # #### Define functions to encapsulate the workflow into a single call

    # In[ ]:


    def run_dask_task(func, **kwargs):
        task = func(**kwargs)
        return task

    def process_quarter_gpu(year=2000, quarter=1, perf_file=""):
        ml_arrays = run_dask_task(delayed(run_gpu_workflow),
                                  quarter=quarter,
                                  year=year,
                                  perf_file=perf_file)
Example #23
def attach_to_cluster(config, create_blazing_context=False):
    """Attaches to an existing cluster if available.
    By default, tries to attach to a cluster running on localhost:8786 (dask's default).

    This is currently hardcoded to assume the dashboard is running on port 8787.

    Optionally, this will also create a BlazingContext.
    """
    host = config.get("cluster_host")
    port = config.get("cluster_port", "8786")

    if host is not None:
        try:
            content = requests.get(
                "http://" + host +
                ":8787/info/main/workers.html").content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    else:
        raise ValueError("Must pass a cluster address to the host argument.")

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    config.update(ucx_config)

    # Save worker information
    gpu_sizes = ["16GB", "32GB", "40GB"]
    worker_counts = worker_count_info(client, gpu_sizes=gpu_sizes)
    for size in gpu_sizes:
        key = size + "_workers"
        if config.get(
                key) is not None and config.get(key) != worker_counts[size]:
            print(
                f"Expected {config.get(key)} {size} workers in your cluster, but got {worker_counts[size]}. It can take a moment for all workers to join the cluster. You may also have misconfigred hosts."
            )
            sys.exit(-1)

    config["16GB_workers"] = worker_counts["16GB"]
    config["32GB_workers"] = worker_counts["32GB"]
    config["40GB_workers"] = worker_counts["40GB"]

    bc = None
    if create_blazing_context:
        bc = BlazingContext(
            dask_client=client,
            pool=os.environ.get("BLAZING_POOL", False),
            network_interface=os.environ.get("INTERFACE", "ib0"),
            config_options=get_config_options(),
            allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "managed"),
            initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE",
                                             None))

    return client, bc
Example #24
def run_preprocessing(input_train_path, workflow_path, output_path,
                      dask_workdir, num_gpus):
    fname = '{}.parquet'
    train_files = [
        i for i in os.listdir(input_train_path)
        if re.match(fname.format('.*'), i) is not None
    ]
    train_paths = [
        os.path.join(input_train_path, filename) for filename in train_files
    ]

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    visible_devices = ",".join([str(n) for n in num_gpus
                                ])  # Delect devices to place workers
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    # Deploy Dask Distributed cluster only if asked for multiple GPUs
    if len(num_gpus) > 1:
        logging.info("Deploy Dask Distributed cluster...")

        device_limit = int(device_limit_frac * device_size)
        device_pool_size = int(device_pool_frac * device_size)

        logging.info("Checking if any device memory is already occupied...")
        # Check if any device memory is already occupied
        for dev in visible_devices.split(","):
            fmem = _pynvml_mem_size(kind="free", index=int(dev))
            used = (device_size - fmem) / 1e9
            if used > 1.0:
                warnings.warn(
                    f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                )

        cluster = None  # (Optional) Specify existing scheduler port
        if cluster is None:
            cluster = LocalCUDACluster(protocol=protocol,
                                       n_workers=len(
                                           visible_devices.split(",")),
                                       CUDA_VISIBLE_DEVICES=visible_devices,
                                       device_memory_limit=device_limit,
                                       local_directory=dask_workdir)

        logging.info("Create the distributed client...")
        # Create the distributed client
        client = Client(cluster)

        logging.info("Initialize memory pools...")

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                # RMM may require the pool size to be a multiple of 256.
                pool_allocator=True,
                initial_pool_size=(device_pool_size // 256) * 256,  # rounded down to a multiple of 256 bytes
            )

        client.run(_rmm_pool)

    # Import the test .parquet
    logging.info("Importing Data...")
    test_dataset = nvt.Dataset(train_paths,
                               engine='parquet',
                               part_size=part_size)

    logging.info("Loading workflow object...")
    workflow = nvt.Workflow.load(workflow_path)

    # Specify the column IDs: these should exactly match the columns used while preprocessing the train and validation datasets
    CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
    CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
    LABEL_COLUMNS = ['label']
    dict_dtypes = {}

    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64

    for col in CONTINUOUS_COLUMNS:
        dict_dtypes[col] = np.float32

    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    # Create output directory for test data
    output_test_dir = os.path.join(output_path, 'train/')

    if not os.path.exists(output_test_dir):
        logging.info(f"Creating train/ directory at: {output_test_dir}")
        os.makedirs(output_test_dir)

    logging.info("Preprocessing Data...")
    workflow.transform(test_dataset).to_parquet(output_path=output_test_dir,
                                                dtypes=dict_dtypes,
                                                cats=CATEGORICAL_COLUMNS,
                                                conts=CONTINUOUS_COLUMNS,
                                                labels=LABEL_COLUMNS)

    logging.info("Done!")
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {(scheduler_workers[w1].name, scheduler_workers[w2].name): [
        "%s/s" % format_bytes(x)
        for x in numpy.quantile(v, [0.25, 0.50, 0.75])
    ]
                  for (w1, w2), v in bandwidths.items()}
    total_nbytes = {(
        scheduler_workers[w1].name,
        scheduler_workers[w2].name,
    ): format_bytes(sum(nb))
                    for (w1, w2), nb in total_nbytes.items()}

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Shuffle benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"partition-size | {format_bytes(args.partition_size)}")
    print(f"in-parts       | {args.in_parts}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    if args.device_memory_limit:
        print(f"memory-limit   | {format_bytes(args.device_memory_limit)}")
    print(f"rmm-pool       | {(not args.disable_rmm_pool)}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2)        | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s)        | %s %s %s (%s)" if args.multi_node or
                   args.sched_addr else "(%02d,%02d)        | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.benchmark_json:
        bandwidths_json = {
            "bandwidth_({d1},{d2})_{i}" if args.multi_node or args.sched_addr
            else "(%02d,%02d)_%s" % (d1, d2, i): parse_bytes(v.rstrip("/s"))
            for (d1, d2), bw in sorted(bandwidths.items()) for i, v in zip(
                ["25%", "50%", "75%", "total_nbytes"],
                [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
            )
        }

        with open(args.benchmark_json, "a") as fp:
            for data_processed, took in took_list:
                fp.write(
                    dumps(
                        dict(
                            {
                                "backend": args.backend,
                                "partition_size": args.partition_size,
                                "in_parts": args.in_parts,
                                "protocol": args.protocol,
                                "devs": args.devs,
                                "device_memory_limit":
                                args.device_memory_limit,
                                "rmm_pool": not args.disable_rmm_pool,
                                "tcp": args.enable_tcp_over_ucx,
                                "ib": args.enable_infiniband,
                                "nvlink": args.enable_nvlink,
                                "data_processed": data_processed,
                                "wall_clock": took,
                                "throughput": data_processed / took,
                            },
                            **bandwidths_json,
                        )) + "\n")

    if args.multi_node:
        client.shutdown()
        client.close()
Example #26
from confluent_kafka import Consumer, KafkaError
from streamz import Stream
from streamz.dataframe import Random
from streamz.dataframe import DataFrame
import json
from dask.distributed import Client
from time import sleep
import random
from time import time

#dask
client = Client('35.180.242.51:8786')
import os
client.run(lambda: os.system("pip install cassandra-driver"))
print('before upload')

#streamz
source = Stream.from_kafka(['supramoteur'], {
    'bootstrap.servers': '35.180.242.51:9092',
    'group.id': 'mygroup1'
},
                           loop=client.loop)


def time_final(entry):
    tt = time() - entry['time']
    return tt


def ecrire(entry):
    entry['time_before'] = time()
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    # Allow the number of chunks to vary between
    # the "base" and "other" DataFrames
    args.base_chunks = args.base_chunks or n_workers
    args.other_chunks = args.other_chunks or n_workers

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {(scheduler_workers[w1].name, scheduler_workers[w2].name): [
        "%s/s" % format_bytes(x)
        for x in numpy.quantile(v, [0.25, 0.50, 0.75])
    ]
                  for (w1, w2), v in bandwidths.items()}
    total_nbytes = {(
        scheduler_workers[w1].name,
        scheduler_workers[w2].name,
    ): format_bytes(sum(nb))
                    for (w1, w2), nb in total_nbytes.items()}

    broadcast = (False if args.shuffle_join else
                 (True if args.broadcast_join else "default"))

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"merge type     | {args.type}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"base-chunks    | {args.base_chunks}")
    print(f"other-chunks   | {args.other_chunks}")
    print(f"broadcast      | {broadcast}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    print(f"rmm-pool       | {(not args.disable_rmm_pool)}")
    print(f"frac-match     | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2)     | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s)     | %s %s %s (%s)" if args.multi_node
                   or args.sched_addr else "(%02d,%02d)     | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
Example #28
async def run():
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 8 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed sources if they are still lingering around
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            if isfile(os.path.join(path, name)) and not name.endswith(
                ("gz", "zip", "tar")):
                data_files.append(os.path.join(path, name))

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")
    if False not in true_false_array:  # If everything went alright, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        exit(-1)

    print("Beginning assembling of all databases into one!")
    # Now, each of the nodes has a local database file, we will now combine these databases into one.
    # We do this process sequentially, because we are not sure yet if SQLite likes it if all nodes do this in parallel.
    # TODO: test if we can do this procedure in each node through the copy_database_to_home_folder, would save copying data
    database_manager = DatabaseManager(
    )  # This creates an empty aip.db if it doesn't exist.
    con3 = database_manager.db  # Reuse the connection

    # based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    for file in [
            os.path.join(db_files_location, f)
            for f in os.listdir(db_files_location)
            if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")
    ]:
        con3.execute("ATTACH '{}' as dba".format(file))

        con3.execute("BEGIN")
        for row in con3.execute(
                "SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        con3.execute("detach database dba")
        con3.commit()
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))
    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)
Example #29
class LargeELMRegressor(_BaseELM, RegressorMixin):
    """ELM Regressor for larger-than-memory problems.

    Uses `Dask <https://dask.org>`_ for batch analysis of data in Parquet files.

    .. attention:: Why do I need Parquet files?

        Parquet files provide necessary information about the data without loading whole file content from
        disk. It makes a tremendous runtime difference compared to simpler `.csv` or `.json` file formats.
        Reading from files saves memory by loading data in small chunks, supporting arbitrarily large input files.
        It also solves current memory leaks with Numpy matrix inputs in Dask.

        Any data format can be easily converted to Parquet, see `Analytical methods <techniques.html>`_ section.

        HDF5 is almost as good as Parquet, but performs worse with Dask due to internal data layout.

    .. todo: Write converters.

    .. todo: Memo about number of workers: one is good, several cover disk read latency but need more memory.
        On one machine, matrix operations always run in parallel and do not benefit from Dask.

    .. todo: Memory consumption with a large number of neurons - 100,000 neurons require 200GB of swap space, with
        read+write reaching 1GB/s. A fast SSD is suggested, or an HDD + extra workers to hide swap latency.
        Mention that Dask is not the perfect solution, kept here for future updates. And it actually solves
        stuff larger than memory, albeit at a very high time+swap cost.

    .. todo: Avoid large batch sizes as workers can fail, safe bet is 2000-5000 range.

    .. todo: Fast HtH and in-place Cholesky solver.

    .. todo: Pro tip in documentation: run ELM with 1000 dummy data samples and 1e+9 regularization;
        this will test possible memory issues for workers without wasting your time on computing the full HH.

    .. todo: Option to keep full HH permanently somewhere at disk. Saves before the final step,
        avoids failures from memory issues during Cholesky solver.

    .. todo: GPU + batch Cholesky solver, for both ELM and LargeELM.

    Requirements
    ------------
        * Pandas
        * pyarrow
        * python-snappy

    Parameters
    ----------

    batch_size : int
        Batch size used for both data samples and hidden neurons. With batch Cholesky solver, allows for very large
        numbers of hidden neurons of over 100,000; limited only by the computation time and disk swap space.

        .. hint:: Include bias and original features for best performance.

        ELM will include a bias term (1 extra feature), and the original features with `include_original_features=True`.
        For optimal performance, choose `batch_size` to equal or evenly divide the
        `n_neurons + 1 (bias) + n_inputs (if include_original_features=True)`.

        .. todo:: Exact batch_size vs. GPU performance
    """
    def __del__(self):
        if hasattr(self, 'client_'):
            self.client_.close()
            self.cluster_.close()

    def _setup_dask_client(self):
        self.cluster_ = LocalCluster(
            n_workers=4,
            threads_per_worker=1,
            local_dir="/Users/akusok/wrkdir/dask-temp",
            memory_limit="8GB")
        self.client_ = Client(self.cluster_)

        W_list = [hl.projection_.components_ for hl in self.hidden_layers_]
        W_dask = [da.from_array(_dense(W), chunks=self.bsize_) for W in W_list]
        self.W_ = self.client_.persist(W_dask)

        def foo():
            import os
            os.environ['OMP_NUM_THREADS'] = '1'

        self.client_.run(foo)

        print("Running on:", self.client_)

        try:
            dashboard = self.client_.scheduler_info()['address'].split(":")
            dashboard[0] = "http"
            dashboard[-1] = str(
                self.client_.scheduler_info()['services']['dashboard'])
            print("Dashboard at", ":".join(dashboard))
        except Exception:
            pass

    def _project(self, X_dask):
        """Compute hidden layer output with Dask functionality.
        """
        H_list = []
        for hl, W in zip(self.hidden_layers_, self.W_):
            if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
                H0 = X_dask.map_blocks(pairwise_distances,
                                       W,
                                       dtype=X_dask.dtype,
                                       chunks=(X_dask.chunks[0],
                                               (W.shape[0], )),
                                       metric=hl.pairwise_metric)
            else:
                XW_dask = da.dot(X_dask, W.transpose())
                if hl.ufunc_ is dummy:
                    H0 = XW_dask
                elif hl.ufunc_ is np.tanh:
                    H0 = da.tanh(XW_dask)
                else:
                    H0 = XW_dask.map_blocks(hl.ufunc_)
            H_list.append(H0)

        # original features (optional) and the bias column go at the end, matching the solver layout
        if self.include_original_features:
            H_list.append(X_dask)
        H_list.append(da.ones((X_dask.shape[0], 1)))

        H_dask = da.concatenate(H_list, axis=1).rechunk(self.bsize_)
        return H_dask

    def _compute(self, X, y, sync_every, HH=None, HY=None):
        """Computing matrices HH and HY, the actually long part.

        .. todo: actually distributed computations that scatter batches of data file names,
            and reduce-sum the HH,HY matrices.
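
        Accumulates ``HH = sum_i H_i^T H_i`` and ``HY = sum_i H_i^T Y_i`` over the input files,
        optionally persisting the partial sums every ``sync_every`` files.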
        """

        # processing files
        for i, (X_file, y_file) in enumerate(zip(X, y)):
            X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
            Y_dask = dd.read_parquet(y_file).to_dask_array(lengths=True)
            H_dask = self._project(X_dask)

            if HH is None:  # first iteration
                HH = da.dot(H_dask.transpose(), H_dask)
                HY = da.dot(H_dask.transpose(), Y_dask)
            else:
                HH += da.dot(H_dask.transpose(), H_dask)
                HY += da.dot(H_dask.transpose(), Y_dask)
                if sync_every is not None and i % sync_every == 0:
                    # wait for the previously persisted partial sums to finish
                    # before re-persisting the updated ones below
                    wait([HH, HY])

            # synchronization: persist the accumulated HH, HY so the task graph stays small
            if sync_every is not None and i % sync_every == 0:
                HH, HY = self.client_.persist([HH, HY])

        # finishing solution
        if sync_every is not None:
            wait([HH, HY])
        return HH, HY

    def _solve(self, HH, HY):
        """Compute output weights from HH and HY using Dask functionality.
        """
        # make HH/HY divisible by chunk size
        n_features, _ = HH.shape
        padding = 0
        if n_features > self.bsize_ and n_features % self.bsize_ > 0:
            print("Adjusting batch size {} to n_features {}".format(
                self.bsize_, n_features))
            padding = self.bsize_ - (n_features % self.bsize_)
            P01 = da.zeros((n_features, padding))
            P10 = da.zeros((padding, n_features))
            P11 = da.zeros((padding, padding))
            HH = da.block([[HH, P01], [P10, P11]])

            P1 = da.zeros((padding, HY.shape[1]))
            HY = da.block([[HY], [P1]])

        # rechunk, add L2 regularization, and solve
        HH = HH.rechunk(
            self.bsize_) + self.alpha * da.eye(HH.shape[1], chunks=self.bsize_)
        HY = HY.rechunk(self.bsize_)

        B = da.linalg.solve(HH, HY, sym_pos=True)
        if padding > 0:
            B = B[:n_features]

        return B

    def fit(self, X, y=None, sync_every=10):
        """Fits an ELM with data in a bunch of files.

        Model will use the set of features from the first file.
        Same features must have same names across the whole dataset.

        .. todo: Check what happens if features are in different order or missing.

        Does **not** support sparse data.

        .. todo:: Check whether some sparse data would work.

        .. todo:: Check that `sync_every` does not affect the results.

        .. todo:: Add single precision support.

        .. todo:: Add Parquet file format examples to the documentation.

        Original features and bias are added to the end of the data for an easier rechunk-merge;
        this way, full chunks of hidden neuron outputs stay intact.


        Parameters
        ----------

        X : [str]
            List of input data files in Parquet format.

        y : [str]
            List of target data files in Parquet format.

        sync_every : int or None
            Synchronize computations after this many files are processed; None runs without synchronization.
            Less frequent synchronization improves run speed with smaller data files, but may result in heavy
            swap space usage on large data problems. Use a smaller number for more frequent synchronization
            if swap space becomes a problem.
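
        A hypothetical usage sketch, assuming an already constructed model; file names are placeholders::

            model.fit(["X_part0.parquet", "X_part1.parquet"],
                      ["y_part0.parquet", "y_part1.parquet"],
                      sync_every=10)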
        """

        if not _is_list_of_strings(X) or not _is_list_of_strings(y):
            raise ValueError("Expected X and y as lists of file names.")

        if len(X) != len(y):
            raise ValueError(
                "Expected X and y as lists of files with the same length. "
                "Got len(X)={} and len(y)={}".format(len(X), len(y)))

        # read first file and get parameters
        X_dask = dd.read_parquet(X[0]).to_dask_array(lengths=True)
        Y_dask = dd.read_parquet(y[0]).to_dask_array(lengths=True)

        n_samples, n_features = X_dask.shape
        if hasattr(self, 'n_features_') and self.n_features_ != n_features:
            raise ValueError(
                'Shape of input is different from what was seen in `fit`')

        _, n_outputs = Y_dask.shape
        if hasattr(self, 'n_outputs_') and self.n_outputs_ != n_outputs:
            raise ValueError(
                'Shape of outputs is different from what was seen in `fit`')

        # set batch size; default is 2000, or all-at-once with fewer than 10,000 samples
        self.bsize_ = self.batch_size
        if self.bsize_ is None:
            self.bsize_ = n_samples if n_samples < 10 * 1000 else 2000

        # init model if not fit yet
        if not hasattr(self, 'hidden_layers_'):
            self.n_features_ = n_features
            self.n_outputs_ = n_outputs

            X_sample = X_dask[:10].compute()
            self._init_hidden_layers(X_sample)
            self._setup_dask_client()

        HH, HY = self._compute(X, y, sync_every=sync_every)
        self.B = self._solve(HH, HY)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Prediction works with both lists of Parquet files and numeric arrays.

        Parameters
        ----------

        X : array-like, [str]
            Input data as list of Parquet files, or as a numeric array.

        Returns
        -------
        Yh : array, shape (n_samples, n_outputs)
            Predicted values for all input samples.

            .. attention:: Returns all outputs as a single in-memory array!

                Danger of running out of memory for high-dimensional outputs if a large set of input
                files is provided. Feed data in smaller batches in that case.
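
                A hypothetical sketch of batched prediction (``model`` and ``files`` are placeholder names)::

                    predictions = [model.predict(files[i:i + 10])
                                   for i in range(0, len(files), 10)]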
        """
        check_is_fitted(self, 'is_fitted_')

        if _is_list_of_strings(X):
            Yh_list = []

            # processing files
            for X_file in X:
                X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
                H_dask = self._project(X_dask)
                Yh_list.append(da.dot(H_dask, self.B))

            Yh_dask = da.concatenate(Yh_list, axis=0)
            return Yh_dask.compute()

        else:
            X = check_array(X, accept_sparse=True)
            # build H with the same column order as _project():
            # hidden neuron outputs first, then original features, then the bias column
            H = [hl.transform(X) for hl in self.hidden_layers_]
            if self.include_original_features:
                H.append(_dense(X))
            H.append(np.ones((X.shape[0], 1)))

            return np.hstack(H) @ self.B.compute()
def attach_to_cluster(config, create_blazing_context=False):
    """Attaches to an existing cluster if available.
    By default, tries to attach to a cluster running on localhost:8786 (dask's default).

    This is currently hardcoded to assume the dashboard is running on port 8787.

    Optionally, this will also create a BlazingContext.
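
    A hypothetical usage sketch (the config keys match the ones read below)::

        config = {"cluster_host": "localhost", "cluster_port": "8786"}
        client, bc = attach_to_cluster(config, create_blazing_context=False)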
    """
    scheduler_file = config.get("scheduler_file_path")
    host = config.get("cluster_host")
    port = config.get("cluster_port", "8786")

    if scheduler_file is not None:
        try:
            with open(scheduler_file) as fp:
                print(fp.read())
            client = Client(scheduler_file=scheduler_file)
            print('Connected!')
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    elif host is not None:
        try:
            content = requests.get(
                "http://" + host +
                ":8787/info/main/workers.html").content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
            config["protocol"] = str(url)[0:3]
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    else:
        raise ValueError(
            "Must pass a scheduler file or cluster address to the host argument."
        )

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    config.update(ucx_config)

    # Save worker information
    # Assumes all GPUs are the same size
    expected_workers = config.get("num_workers")
    worker_counts = worker_count_info(client)
    gpu_size, current_workers = None, 0  # defaults in case no workers have joined yet
    for gpu_size, count in worker_counts.items():
        if count != 0:
            current_workers = worker_counts.pop(gpu_size)
            break

    if expected_workers is not None and expected_workers != current_workers:
        print(
            f"Expected {expected_workers} {gpu_size} workers in your cluster, but got {current_workers}. It can take a moment for all workers to join the cluster. You may also have misconfigred hosts."
        )
        sys.exit(-1)

    config["16GB_workers"] = worker_counts.get("16GB", 0)
    config["32GB_workers"] = worker_counts.get("32GB", 0)
    config["40GB_workers"] = worker_counts.get("40GB", 0)

    bc = None
    if create_blazing_context:
        from blazingsql import BlazingContext
        bc = BlazingContext(
            dask_client=client,
            pool=os.environ.get("BLAZING_POOL", False),
            network_interface=os.environ.get("INTERFACE", "ib0"),
            config_options=get_bsql_config_options(),
            allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "managed"),
            initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE",
                                             None))

    return client, bc
import importlib
import time
import dask.delayed
import numpy as np
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource
from dask.distributed import Client
import allen_comparison

#output_notebook()
directory = "/home/jdehning/tmp/Emx1-s_highzoom"
#directory = "/scratch.local/jdehning/calcium_ephys_comparison_data/processed_data/Emx1-s_highzoom"
#directory = "/scratch.local/jdehning/calcium_ephys_comparison_data/processed_data/Emx1-s_lowzoom"

#client = Client('localhost:8786')
client = Client('localhost:42747')

client.upload_file('allen_comparison.py')
client.run(importlib.import_module, 'allen_comparison')
futures = []
last_pos = 0


def modify_doc(doc):
    # Set up data
    #ephys, ophys, dt = allen_comparison.open_dir(directory)
    ephys = allen_comparison.open_ephys(directory, client)
    ophys = allen_comparison.open_ophys(directory, client)
    k_arr = np.arange(1, 35)
    sources = []
    plots = []
    for i in range(len(ephys)):
        source1 = ColumnDataSource(data=dict(x=k_arr, y=np.zeros_like(k_arr)))
        source2 = ColumnDataSource(data=dict(x=k_arr, y=np.zeros_like(k_arr)))