Example #1
def test_run_dask(fix_task_env):
    import numpy as np
    from dask import delayed as dl
    from dask.distributed import Client
    dc = Client(processes=False)

    input_task_example, gathered_task_example, post_processing_task_example = fix_task_env
    parts = {'a': [0., 1., 2.], 'b': [-3., 10., 2.], 'c': [20.]}
    numpoints = 20
    prefactor = 0.1

    input_delayed = dl(input_task_example)(parts)
    gathered_delayed = dl(gathered_task_example, nout=1)([input_delayed],
                                                         [numpoints])[0]
    post_proc_delayed = dl(post_processing_task_example)(input_delayed,
                                                         gathered_delayed,
                                                         prefactor)
    input_future = dc.compute(input_delayed)
    gathered_future = dc.compute(gathered_delayed)
    post_proc_future = dc.compute(post_proc_delayed)
    input_data = input_future.result()
    gathered_data = gathered_future.result()
    post_proc_data = post_proc_future.result()

    assert input_data == parts
    gather_results = {}
    for part in parts:
        gather_results[part] = np.linspace(0.0, 1.0, numpoints)
    for part in gather_results:
        assert np.all(gathered_data[part] == gather_results[part])
    post_proc_results = 0.0
    for part in parts:
        post_proc_results += prefactor * np.sum(input_data[part]) * \
            np.sum(gather_results[part])
    assert (post_proc_data == post_proc_results)
Example #2
class DaskCUDF:
    def __init__(self):
        self._dask = None
        self._client = None
        self._cluster = None

    def create(self, *args, **kwargs):
        logger.print(JUST_CHECKING)
        logger.print("-----")

        logger.print(STARTING_DASK)

        cluster = LocalCUDACluster()
        self._client = Client(cluster, *args, **kwargs)
        self._cluster = cluster

        return self

    def load(self, session):
        self._client = session
        return self

    @property
    def dask(self):
        return self._client

    def compute(self, *args, **kwargs):
        return self._client.compute(*args, **kwargs)

    def info(self):
        return self._client.scheduler_info()

    def cuda_cluster(self):
        return self._cluster
Example #3
def test_wait_for_future():
    client = Client(
        processes=False, n_workers=1, threads_per_worker=1, dashboard_address=None
    )
    fut = client.compute(slow_compute(1))
    rr = list(wait_for_future(fut, 0.1))
    assert fut.done()
    assert len(rr) > 1

    # Check that exception doesn't leak out
    fut = client.compute(slow_compute(1, fail=True))
    rr = list(wait_for_future(fut, 0.1))
    assert fut.done()
    assert fut.status == "error"
    assert len(rr) > 1
    print(fut)
Example #4
class DaskManager(metaclass=Singleton):
    def __init__(self):
        self.client = Client(
            f'{settings.DASK_SCHEDULER_HOST}:{settings.DASK_SCHEDULER_PORT}')

    def compute(self, graph):
        future = self.client.compute(graph)
        future.add_done_callback(self.task_complete)
        dask_task = DaskTask.objects.create(task_key=future.key)
        return dask_task

    def get_future_status(self, task_key):
        return Future(key=task_key, client=self.client).status

    @staticmethod
    def task_complete(future):
        task = DaskTask.objects.get(pk=future.key)

        if future.status == 'finished':
            task.status = future.status
            task.result = future.result()
            task.save()
        elif future.status == 'error':
            task.status = future.status
            task.result = traceback.extract_tb(
                future.traceback()) + [future.exception()]
            task.save()
            # will cause exception to be thrown here
            future.result()
        else:
            logger.error('Task completed with unhandled status: ' +
                         future.status)
Example #5
class Dask:
    def __init__(self):
        self._dask = None
        self._client = None

    def create(self, *args, **kwargs):
        logger.print(JUST_CHECKING)
        logger.print("-----")

        logger.print(STARTING_DASK)

        # Create Dask client
        self._client = Client(*args, **kwargs)

        # Print cluster info
        # self._client.scheduler_info()["workers"]

        return self

    def load(self, session):
        self._client = session
        # self._sc = session.sparkContext
        return self


    @property
    def dask(self):
        return self._client

    def compute(self, *args, **kwargs):
        return self._client.compute(*args, **kwargs)
Example #6
def test_docker_sweep(fix_task_env):
    import subprocess
    from dask import delayed as dl
    from dask.distributed import Client
    import numpy as np

    # First, set up dask cluster, which for now is just one scheduler and one worker
    # TODO: Make this run in series of docker containers (e.g. with docker-compose)
    scheduler_command = ["dask-scheduler", "--port", "8781", "--no-bokeh"]
    worker_command = [
        "dask-worker",
        "--nthreads",
        "1",
        "--nprocs",
        "1",
        "--no-bokeh",
        "0.0.0.0:8781",
    ]
    processes = []
    try:
        processes.append(subprocess.Popen(scheduler_command))
        processes.append(subprocess.Popen(worker_command))
        client = Client("0.0.0.0:8781")

        # Next, perform the same sweep as before:
        input_task_example, gathered_task_example, post_processing_task_example = (
            fix_task_env)

        delayeds = []
        collected_inputs = []
        for tag1 in np.linspace(0.0, 10.0, 3):
            parts = {
                "a": [tag1, 1.0, 2.0],
                "b": [-3.0, 10.0, 2.0],
                "c": [20.0]
            }
            collected_inputs += [dl(input_task_example)(parts)]
        for tag2 in range(1, 3):
            num_grid_vec = tag2 * np.ones(
                (len(collected_inputs), ), dtype=int)
            collected_outputs = dl(gathered_task_example,
                                   nout=3)(collected_inputs, num_grid_vec)
            for i, output in enumerate(collected_outputs):
                input_data = collected_inputs[i]
                for tag3 in np.linspace(-1.0, 1.0, 4):
                    delayeds += [
                        dl(post_processing_task_example)(input_data, output,
                                                         tag3)
                    ]
        results = []
        for obj in delayeds:
            results += [client.compute(obj).result()]
        assert len(results) == 24 and results[3] == 0.0
    finally:
        # Clean up processes
        for p in processes:
            p.kill()
Example #7
def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)
    new_ddf = dcg.drop_duplicates(x)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
    wait(pr)
    t3 = time.time()
    print("Running PR algo time: ", t3 - t2)
    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)
    t6 = time.time()
    # For bigdatax4, chunksize=100000000 to avoid oom on write csv
    res_df.to_csv('~/pagerank.csv', header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    # Comparison
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))

    client.close()
    cluster.close()
Example #8
def process_tasks(tasks: Iterable[Task],
                  proc: TaskProc,
                  client: Client,
                  sink: S3COGSink,
                  check_exists: bool = True,
                  chunked_persist: int = 0,
                  verbose: bool = True) -> Iterator[str]:

    def prep_stage(tasks: Iterable[Task],
                   proc: TaskProc) -> Iterator[Tuple[Union[xr.Dataset, xr.DataArray, None], Task, str]]:
        for task in tasks:
            path = sink.uri(task)
            if check_exists:
                if sink.exists(task):
                    yield (None, task, path)
                    continue

            ds = proc(task)
            yield (ds, task, path)

    in_flight_cogs: Set[Future] = set()
    for ds, task, path in _with_lookahead1(prep_stage(tasks, proc)):
        if ds is None:
            if verbose:
                print(f"..skipping: {path} (exists already)")
            yield path
            continue

        if chunked_persist > 0:
            assert isinstance(ds, xr.DataArray)
            ds = chunked_persist_da(ds, chunked_persist, client)
        else:
            ds = client.persist(ds, fifo_timeout='1ms')

        if len(in_flight_cogs):
            done, in_flight_cogs = drain(in_flight_cogs, 1.0)
            for r in done:
                yield r

        if isinstance(ds, xr.DataArray):
            attrs = ds.attrs.copy()
            ds = ds.to_dataset(dim='band')
            for dv in ds.data_vars.values():
                dv.attrs.update(attrs)

        cog = client.compute(sink.dump(task, ds),
                             fifo_timeout='1ms')
        rr = dask_wait(ds)
        assert len(rr.not_done) == 0
        del ds, rr
        in_flight_cogs.add(cog)

    done, _ = drain(in_flight_cogs)
    for r in done:
        yield r
Example #9
def test_tree_reduce_delayed(n_parts, cluster):

    client = Client(cluster)

    func = delayed(sum)

    a = [delayed(i) for i in range(n_parts)]
    b = tree_reduce(a, func=func)
    c = client.compute(b, sync=True)

    assert (sum(range(n_parts)) == c)
Example #10
def parallel_write(filename: str, darray: dask.array) -> None:
    """Distribute Zarr writing task to workers using dask.
    Input filename should have extension .zarr"""
    client = Client()
    out = darray.to_zarr(filename, compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE),
        compute=False)
    try:
        fut = client.compute(out)
        progress(fut)  # display a progress bar for the ongoing write
    except BrokenPipeError:
        print('Process complete (likely)...')
Example #11
def test_tree_reduce_futures(n_parts, cluster):

    client = Client(cluster)
    try:

        a = [client.submit(s, i) for i in range(n_parts)]
        b = tree_reduce(a)
        c = client.compute(b, sync=True)

        assert (sum(range(n_parts)) == c)
    finally:
        client.close()
Example #12
def test_docker_sweep(fix_task_env, fix_setup_docker):
    import subprocess
    from dask import delayed as dl
    from dask.distributed import Client
    import numpy as np

    # First, set up the docker + dask cluster, which for now is just one scheduler and one worker
    scheduler_command = [
        'dask-scheduler', '--port', '8781', '--bokeh-port', '8780'
    ]
    worker_command = [
        'dask-worker', '--nthreads', '1', '--nprocs', '1', 'localhost:8781'
    ]
    docker_command = ['docker', 'run', '-d', '--network', 'host', 'qmt:master']
    containers = []
    try:
        containers.append(
            subprocess.check_output(docker_command +
                                    scheduler_command).splitlines()[0])
        containers.append(
            subprocess.check_output(docker_command +
                                    worker_command).splitlines()[0])
        client = Client('localhost:8781')

        # Next, perform the same sweep as before:
        input_task_example, gathered_task_example, post_processing_task_example = fix_task_env

        delayeds = []
        collected_inputs = []
        for tag1 in np.linspace(0., 10., 3):
            parts = {'a': [tag1, 1., 2.], 'b': [-3., 10., 2.], 'c': [20.]}
            collected_inputs += [dl(input_task_example)(parts)]
        for tag2 in range(1, 3):
            num_grid_vec = tag2 * np.ones(
                (len(collected_inputs), ), dtype=int)
            collected_outputs = dl(gathered_task_example,
                                   nout=3)(collected_inputs, num_grid_vec)
            for i, output in enumerate(collected_outputs):
                input_data = collected_inputs[i]
                for tag3 in np.linspace(-1.0, 1., 4):
                    delayeds += [
                        dl(post_processing_task_example)(input_data, output,
                                                         tag3)
                    ]
        results = []
        for obj in delayeds:
            results += [client.compute(obj).result()]
        assert len(results) == 24 and results[3] == 0.0
    finally:
        # Clean up the docker containers
        for c in containers:
            subprocess.check_output(['docker', 'kill', c])
Example #13
def main(src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    # assume we have tunneled the scheduler to the local machine
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)

    files = glob.glob(os.path.join(src_dir, "*_predictions.h5"))
    logger.info(f"{len(files)} tile(s) to convert")

    dname = os.path.basename(src_dir)
    dname = dname.rsplit("_", 1)[0]
    dname = f"{dname}_labels"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    futures = []
    for f in files:
        probabilities = read_h5(f, "predictions")

        label = as_label(probabilities, dtype=np.uint16)

        fname = os.path.basename(f)
        fname, _ = os.path.splitext(fname)
        fname = f"{fname}.tif"
        dst_path = os.path.join(dst_dir, fname)
        dst_path = write_tiff(dst_path, label)

        future = client.compute(dst_path)

        futures.append(future)

    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            try:
                dst_path = future.result()
                pbar.set_description(dst_path)
            except Exception as error:
                logger.exception(error)
            finally:
                pbar.update(1)
                del future

    logger.info("closing scheduler connection")
    client.close()
Example #14
def connect_glue():
    # os.system('dask-ssh 128.104.222.{103,104,105,107}')
    # subprocess.call('dask-ssh', '128.104.222.{103,104,105,107}')
    # time.sleep(10)
    import numpy as np

    client = Client('128.104.222.103:8786')
    client.restart()
    x = da.from_zarr('/mnt/cephfs/smltar_numpyarr/zarr_data_full')
    print(x)
    # y = x[0:1]
    # z = x[100:101]
    # m = x[1000:1001]
    # n = x[1500:1501]
    # p = x[1400:1401]

    y = x[0:30]
    z = x[100:130]
    m = x[1000:1030]
    n = x[1500:1530]
    p = x[1400:1430]

    # zc = x[108:208]
    # mc = x[1008:1108]
    # nc = x[1508:1608]
    # pc = x[1601:1701]
    #
    sum = (y + z - m + p) * n
    #
    # sum2 = (zc + mc + nc +pc)*sum
    #
    # sum3 = sum2 + (zc + mc + nc +pc)*sum
    #
    #
    #
    # print(sum2)
    # sum.visualize('sum3')

    # frm = sum[15]
    fu = client.compute(sum)
    # p = r.result()
    # print(type(p))
    # return p
    re = fu.result()

    re = np.array(re)
    np.save("/mnt/cephfs/result/test", re[15])

    # print(p)
    return re[15]
Example #15
def code_range(path_dict):
    """
    path_dict = RANGE_CATEGORIES[5]
    """

    # Create a numeric dictionary for these keys
    key_dict = {key: i + 1 for i, key in enumerate(path_dict.keys())}
    number_dict = {k: i for i, k in key_dict.items()}
    vals = key_dict.values()
    combos = [[c for c in combinations(vals, i + 1)] for i in range(len(vals))]
    combos = [c for sc in combos for c in sc]
    combo_keys = {}
    for combo in combos:
        key = "-".join([number_dict[c] for c in combo])
        value = seq_int(combo)
        combo_keys[key] = value

    # Assign each raster a unique value
    arrays = []
    for key, path in path_dict.items():
        value = key_dict[key]
        full_path = DP.join(path)
        array = xr.open_rasterio(full_path, chunks=CHUNKS)[0].data
        array[da.isnan(array)] = 0
        array[array > 0] = value
        arrays.append(array)

    # Stack everything together - we might have to save this a temporary file
    stack = da.stack(arrays, axis=0)
    stack = stack.rechunk((stack.shape[0], 5000, 5000))
    stack = stack[:, 4000:10000, 4000:10000]

    # Try to map the function to each point
    client = Client()
    codes = da.apply_along_axis(seq_int, 0, stack, dtype="uint8")
    future = client.compute(codes)
    result = future.result()
    client.shutdown()
    client.close()

    # Save to temp and delete
    template = rasterio.open(full_path)
    temp_path = DP2.join("test.tif")
    with rasterio.Env():
        profile = template.profile
        profile.update(dtype=rasterio.uint8, count=1, compress='lzw')
        with rasterio.open(temp_path, 'w', **profile) as dst:
            dst.write(result)
Example #16
def compute_tasks(tasks: Iterable[Any],
                  client: Client,
                  max_in_flight: int = 3) -> Iterable[Any]:
    """ Parallel compute stream with back pressure.

        Equivalent to:


        .. code-block:: python

            (client.compute(task).result()
              for task in tasks)

        but with up to ``max_in_flight`` tasks being processed at the same time.
        Input/Output order is preserved, so there is a possibility of head of
        line blocking.

        .. note::

              The lower limit is 3 concurrent tasks, to simplify the implementation:
              there is no point calling this function if you only want one active
              task, and supporting exactly 2 active tasks is not worth the added
              complexity for now. We might special-case 2 at some point.

    """
    # New thread:
    #    1. Take dask task from iterator
    #    2. Submit to client for processing
    #    3. Send it off to wrk_q
    #
    # Calling thread:
    #    1. Pull scheduled future from wrk_q
    #    2. Wait for result of the future
    #    3. yield result to calling code
    from .generic import it2q, qmap

    # (max_in_flight - 2) -- one on each side of queue
    wrk_q = queue.Queue(maxsize=max(1, max_in_flight - 2))  # type: queue.Queue

    # fifo_timeout='0ms' ensures that priority of later tasks is lower
    futures = (client.compute(task, fifo_timeout='0ms') for task in tasks)

    in_thread = threading.Thread(target=it2q, args=(futures, wrk_q))
    in_thread.start()

    yield from qmap(lambda f: f.result(), wrk_q)

    in_thread.join()
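
A minimal usage sketch for ``compute_tasks``, assuming a local in-process ``Client`` and a trivial ``dask.delayed`` workload (the driver below is illustrative, not part of the original source):

def _example_compute_tasks_driver():
    import dask
    from dask.distributed import Client

    client = Client(processes=False)  # assumption: small local in-process cluster
    tasks = (dask.delayed(sum)(range(n)) for n in range(10))
    # At most three tasks are in flight at any time; results come back in input order.
    for result in compute_tasks(tasks, client, max_in_flight=3):
        print(result)
    client.close()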
Example #17
def test_reduce_futures(n_parts, cluster):
    def s(x):
        return x

    client = Client(cluster)

    try:

        a = [client.submit(s, i) for i in range(n_parts)]
        b = reduce(a, sum)
        c = client.compute(b, sync=True)

        # Testing this gets the correct result for now.
        assert (sum(range(n_parts)) == c)
    finally:
        client.close()
Example #18
def run_dask_compute(h5_main):
    raw_data = h5_main[()]
    #cpu_cores = int(cpu_cores/8)
    #dask_raw_data = da.from_array(raw_data, chunks='auto')
    #cluster = LocalCluster(n_workers=cpu_cores/8)
    #client = Client(cluster, processes=True)
    #map = dask_raw_data.map_blocks(find_all_peaks, [20, 60], num_steps=30)
    #results = map.compute()
    client = Client(processes=False)
    dask_raw_data = client.scatter(raw_data)
    args = [[20, 60]]
    kwargs = {'num_steps': 30}
    L = client.submit(find_all_peaks, dask_raw_data, args, kwargs)
    dask_results = client.compute(L)
    cores = client.ncores()
    client.close()
    return cores
Example #19
def test_pagerank():
    gc.collect()
    input_data_path = r"datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    import pandas as pd
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    import networkx as nx
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    import dask_cudf
    import dask_cugraph.pagerank as dcg

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)
    pr = dcg.pagerank(x, alpha=0.85, max_iter=50)
    t2 = time.time()
    print("Running PR algo time: ", t2 - t1)
    res_df = pr.compute()

    # Comparison
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))
Example #20
def main(src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    # assume we have tunneled the scheduler to the local machine
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)

    files = glob.glob(os.path.join(src_dir, "*.tif"))
    logger.info(f"{len(files)} tile(s) to convert")

    dname = os.path.basename(src_dir)
    dname = f"{dname}_h5"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    # write back
    write_back_tasks = []
    for i, src_path in enumerate(files):
        data = delayed(imageio.volread)(src_path)

        fname = f"tile_{i:04d}.h5"
        dst_path = os.path.join(dst_dir, fname)
        future = pack_arrays(dst_path, data)

        write_back_tasks.append(future)

    # submit task
    futures = client.compute(write_back_tasks, scheduler="processes")
    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            pbar.update(1)

    logger.info("closing scheduler connection")
    client.close()
Example #21
def test_handles(cluster):

    client = Client(cluster)

    def _has_handle(sessionId):
        return local_handle(sessionId) is not None

    try:
        cb = Comms(verbose=True)
        cb.init()

        dfs = [
            client.submit(_has_handle, cb.sessionId, pure=False, workers=[w])
            for w in cb.worker_addresses
        ]
        wait(dfs, timeout=5)

        assert all(client.compute(dfs, sync=True))

    finally:
        cb.destroy()
        client.close()
Example #22
def main(threads=1):
    headers = dict(EDP=[0, 1, 2, 3], DM=[0, 1, 2], DV=[0, 1, 2, 3])

    use_dask = threads > 1

    if use_dask:

        log_msg('{} threads requested. Using DASK.'.format(threads))

        from dask.distributed import Client, LocalCluster
        from dask import delayed
        import math

        @delayed
        def read_csv_files(file_list, header):
            return [
                pd.read_csv(fn, header=header, index_col=0) for fn in file_list
            ]

        def read_csv_np(file, header):
            res = np.loadtxt(file, delimiter=',', dtype=str)

            first_row = header[-1] + 1
            data = res[first_row:].T[1:].T
            data[data == ''] = np.nan

            tuples = [tuple(h) for h in res[:first_row].T[1:]]
            MI = pd.MultiIndex.from_tuples(tuples, names=res[:first_row].T[0])

            df = pd.DataFrame(data,
                              columns=MI,
                              index=res[first_row:].T[0],
                              dtype=float)

            return df

        @delayed
        def read_csv_files_np(file_list, header):
            return [read_csv_np(fn, header=header) for fn in file_list]

        cluster = LocalCluster()
        client = Client(cluster)

        log_msg('Cluster initialized.')
        log_msg(client)

    for res_type in ['EDP', 'DM', 'DV']:
        #for res_type in ['EDP', 'DV']:

        log_msg('Loading {} files...'.format(res_type))

        files = glob.glob('./results/{}/*/{}_*.csv'.format(res_type, res_type))
        #files = files[:1000]

        if use_dask:

            file_count = len(files)
            chunk = math.ceil(file_count / threads)
            df_list = []

            for t_i in range(threads):

                if t_i * chunk < file_count - 1:
                    df_list_i = delayed(read_csv_files)(
                        files[t_i * chunk:(t_i + 1) * chunk],
                        headers[res_type])
                    df_i = delayed(pd.concat)(df_list_i, axis=0, sort=False)

                    df_list.append(df_i)

                elif t_i * chunk == file_count - 1:
                    df_i = delayed(read_csv_files)(
                        files[t_i * chunk:(t_i + 1) * chunk],
                        headers[res_type])
                    df_i = df_i[0]

                    df_list.append(df_i)

            df_all = delayed(pd.concat)(df_list, axis=0, sort=False)

            df_all = client.compute(df_all)

            df_all = client.gather(df_all)

        else:
            log_msg('Loading all files')
            df_list = [
                pd.read_csv(resFileName, header=headers[res_type], index_col=0)
                for resFileName in files
            ]

            log_msg('Concatenating all files')
            df_all = pd.concat(df_list, axis=0, sort=False)

        df_all.sort_index(axis=0, inplace=True)

        # save the results
        log_msg('Saving results')
        df_all.index = df_all.index.astype(np.int32)
        df_all.to_hdf('{}.hd5'.format(res_type),
                      'data',
                      mode='w',
                      format='fixed',
                      complevel=1,
                      complib='blosc:snappy')
        #df_all.to_csv('{}.csv'.format(res_type))

    if use_dask:

        log_msg('Closing cluster...')
        cluster.close()
        client.close()

    # aggregate the realizations files
    log_msg('Aggregating individual realizations...')

    files = glob.glob('./results/{}/*/{}_*.hd5'.format('realizations',
                                                       'realizations'))

    log_msg('Number of files: {}'.format(len(files)))

    # get the keys from the first file
    if len(files) > 0:
        first_file = pd.HDFStore(files[0])
        keys = first_file.keys()
        first_file.close()

        for key in keys:
            log_msg('Processing realizations for key {key}'.format(key=key))
            df_list = [pd.read_hdf(resFileName, key) for resFileName in files]

            log_msg('\t\tConcatenating files')
            df_all = pd.concat(df_list, axis=0, sort=False)

            df_all.index = df_all.index.astype(np.int32)

            df_all.sort_index(axis=0, inplace=True)

            df_all.astype(np.float16).to_hdf('realizations.hd5',
                                             key,
                                             mode='a',
                                             format='fixed',
                                             complevel=1,
                                             complib='blosc:snappy')

            log_msg('\t\tResults saved for {key}.'.format(key=key))

    log_msg('End of script')
Example #23
        print(("File " + tar_name + " not found in \n" + tar_path))
        sys.exit('File not found.')

    #----make temp-folder for extracting the tar file:------
    temp_folder = tempfile.mkdtemp()
    print(('Temp-folder ' + temp_folder))

    #---uncompress Data:-------------
    print("Now Uncompressing " + tar_name)
    uncompressTGZ(tar_path + tar_name, temp_folder)

    #----calculate cloudiness----------

    #    cc,ASCAtime,cloudmask,scale_factor = cloudiness(temp_folder,client)
    compute_me = delayed(cloudiness)(temp_folder)
    computed = client.compute(compute_me)
    cc, ASCAtime, cloudmask, scale_factor = client.gather(computed)

    #-------Remove temp-folder------
    print("Removing temporaray folder")
    try:
        shutil.rmtree(temp_folder)
        print("Succesfully removed.")
    except:
        print(("Temporary folder {0} could not found or deleted".format(
            temp_folder)))

    #------write to netCDF----------
    print("Writing results to netCDF")
    ncName = "CloudCoverage_" + ASCAtime[0].strftime("%Y%m%d") + ".nc"
Example #24
def main(config_path, src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    # assume we have tunneled the scheduler to the local machine
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)

    # load dataset
    src_ds = open_dataset(src_dir)
    desc = tuple(f"{k}={v}"
                 for k, v in zip(("x", "y", "z"), reversed(src_ds.tile_shape)))
    logger.info(f"tiling dimension ({', '.join(desc)})")

    # generate tile index list (TODO deal with multi-color/view here)
    def groupby_tiles(inventory, index: List[str]):
        """
        Aggregation function that generates the proper internal list layout for all the tiles in their natural N-D layout.

        Args:
            inventory (pd.DataFrame): the listing inventory
            index (list of str): the column header
        """
        tiles = []
        for _, tile in inventory.groupby(index[0]):
            if len(index) > 1:
                # we are not at the fastest dimension yet, decrease 1 level
                tiles.extend(groupby_tiles(tile, index[1:]))
            else:
                # fastest dimension, call retrieval function
                tiles.append(src_ds[tile])
        return tiles

    index = ["tile_y", "tile_x"]
    if "tile_z" in src_ds.index.names:
        index = ["tile_z"] + index
    logger.info(f"a {len(index)}-D tiled dataset")

    tiles = groupby_tiles(src_ds, index)
    logger.info(f"{len(tiles)} to process")

    # downsample
    tiles_bin4 = [tile[:, ::4, ::4] for tile in tiles]

    dname = os.path.basename(src_dir)
    dname = f"{dname}_bin4"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    # write back
    write_back_tasks = []
    for i, tile in enumerate(tiles_bin4):
        fname = f"tile_{i:04d}.tif"
        path = os.path.join(dst_dir, fname)
        future = delayed(imageio.volwrite)(path, tile)
        write_back_tasks.append(future)
    futures = client.compute(write_back_tasks, scheduler="processes")

    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            print(future.result())
            pbar.update(1)

    logger.info("closing scheduler connection")
    client.close()
Example #25
class DaskManager(metaclass=Singleton):
    def __init__(self):
        if settings.DASK_SCHEDULER_HOST is None:
            self.client = Client(preload='daskworkerinit.py',
                                 n_workers=2,
                                 threads_per_worker=1,
                                 memory_limit='4GB')
        else:
            self.client = Client(settings.DASK_SCHEDULER_HOST + ':' +
                                 settings.DASK_SCHEDULER_PORT)

    def compute(self, graph):
        future = self.client.compute(graph)
        future.add_done_callback(self.task_complete)
        dask_task = DaskTask.objects.create(task_key=future.key)
        return dask_task

    def get_future_status(self, task_key):
        return Future(key=task_key, client=self.client).status

    @staticmethod
    def task_complete(future):
        task = DaskTask.objects.get(pk=future.key)

        if future.status == 'finished':
            task.status = future.status
            task.result = future.result()
            task.end_time = timezone.now()
            task.duration_ms = int(
                (task.end_time - task.start_time).total_seconds() * 1000)
            task.save()
        elif future.status == 'error':
            task.status = future.status
            task.result = traceback.extract_tb(
                future.traceback()) + [future.exception()]
            task.end_time = timezone.now()
            task.duration_ms = int(
                (task.end_time - task.start_time).total_seconds() * 1000)
            task.save()
        else:
            logger.error(
                f'Task completed with unhandled status: {future.status}')

        # send an email that the task has finished; this could be much more complex, but we keep it simple
        if future.status == 'finished':
            # The requested format. A more generic message (task id plus a preview of the
            # result) would suit all tasks, but for a micro service dedicated to this
            # particular task this works.
            result = future.result()
            formatted_time = task.end_time.strftime("%d/%m/%Y %H:%M")
            message = f'{formatted_time} - {result["lines"]} entries processed, sum: {result["sum"]}'
        elif future.status == 'error':
            message = f'Task id {task.task_key} failed.'
        else:
            message = f'Task id {task.task_key} completed with unknown status: {future.status}'

        send_mail(
            'Task Completed',
            message,
            settings.DEFAULT_FROM_EMAIL,
            [settings.TASK_INFO_EMAIL],
            fail_silently=False,
        )
Example #26
def tsmask_one_iteration(ncpu, mem, block, crs, out_crs, start_of_epoch,
                         end_of_epoch, dirc, loc_str):

    [y1, y2, x1, x2] = block

    #Datacube object

    dc = datacube.Datacube(app='load_clearsentinel')

    tg_ds = tsf.load_s2_nbart_dask(dc, y1, y2, x1, x2, start_of_epoch,
                                   end_of_epoch, {
                                       "time": 1,
                                   }, crs, out_crs)

    memstr = str(mem) + 'GB'

    client = Client(n_workers=ncpu, threads_per_worker=2, memory_limit=memstr)

    client.compute(tg_ds)

    client.close()

    irow = tg_ds['y'].size
    icol = tg_ds['x'].size
    tn = tg_ds['time'].size

    print(tn, irow, icol)

    # Create numpy array to store TSmask results
    tsmask = np.zeros((tn, irow, icol), dtype=np.uint8)

    print("Time series cloud and shadow detection for area (", y1, y2, x1, x2,
          ")")

    # Run time series cloud mask algorithm on the data
    tsmask = tsmask_filter_onearea(tg_ds, ncpu, tsmask)

    print("Begin applying spatial filter")

    results = []

    # number of processes for the pool object
    number_of_workers = ncpu

    # Create a Pool object with a number of processes
    p = Pool(number_of_workers)

    # create a list of scenes
    paralist = [tsmask[i, :, :] for i in range(tn)]

    # Start running the spatial filter function using a pool of independent processes
    results = p.map(cym.spatial_filter_v2, paralist)

    # Finish the parallel runs
    p.close()

    # Wait for the worker processes to finish
    p.join()

    # Save the cloud/shadow masks to the 'tsmask' dataarray in the s2_ds dataset
    for i in range(tn):
        tsmask[i, :, :] = results[i]

    print("Begin calculting long term of the indice set")
    bgids = bg_indices_one_iteration(ncpu, tg_ds, dirc, loc_str, tsmask,
                                     start_of_epoch, end_of_epoch)

    print(bgids.shape)

    # print("Begin creating input features for Nmask ANN model")
    # create_ip_data(tg_ds, bgids, loc_str, dirc)

    tg_ds.close()
Example #27
        k: bool(int(v))
        for k, v in read_pairs_list('data/modifiers.txt')
    }
    modifiers_dict[None] = True
    candidates = make_candidates(matched, modifiers_dict.keys())
    print('Number of candidates : {}'.format(len(candidates)))

    if True:  # Dask processing
        cluster = LocalCluster(n_workers=48)
        client = Client(cluster)
        b = db.from_sequence(failed, partition_size=200)
        [c] = client.scatter(
            [candidates],
            broadcast=True)  # Broadcast the list of candidates to the workers
        r = b.map(_fn, c)
        f = client.compute(r)
        progress(f)
        matching_results = f.result()
    else:  # Multiprocessing
        matching_results = []
        with Pool(40) as p:
            for simple_result in tqdm(p.imap(_fn, failed, chunksize=300),
                                      total=len(failed)):
                matching_results.append(simple_result)

    matching_results = sorted(matching_results,
                              key=lambda x: x[1][0][0],
                              reverse=True)

    # normalized_str -> (normalized_name, ({'id':_,'name':_}, modifier_str) )
    matching_dict = {
Example #28
    def run(self, current_date: datetime, dry_run: bool = False) -> None:
        """
        Run analysis using mozanalysis for a specific experiment.
        """
        global _dask_cluster
        logger.info("Analysis.run invoked for experiment %s", self.config.experiment.normandy_slug)

        self.check_runnable(current_date)
        assert self.config.experiment.start_date is not None  # for mypy

        self.ensure_enrollments(current_date)

        # set up dask
        _dask_cluster = _dask_cluster or LocalCluster(
            dashboard_address=DASK_DASHBOARD_ADDRESS,
            processes=True,
            threads_per_worker=1,
            n_workers=DASK_N_PROCESSES,
        )
        client = Client(_dask_cluster)

        results = []

        if self.log_config:
            log_plugin = LogPlugin(self.log_config)
            client.register_worker_plugin(log_plugin)

            # add profiling plugins
            # resource_profiling_plugin = ResourceProfilingPlugin(
            #     scheduler=_dask_cluster.scheduler,
            #     project_id=self.log_config.log_project_id,
            #     dataset_id=self.log_config.log_dataset_id,
            #     table_id=self.log_config.task_profiling_log_table_id,
            #     experiment=self.config.experiment.normandy_slug,
            # )
            # _dask_cluster.scheduler.add_plugin(resource_profiling_plugin)

            # task_monitoring_plugin = TaskMonitoringPlugin(
            #     scheduler=_dask_cluster.scheduler,
            #     project_id=self.log_config.log_project_id,
            #     dataset_id=self.log_config.log_dataset_id,
            #     table_id=self.log_config.task_monitoring_log_table_id,
            #     experiment=self.config.experiment.normandy_slug,
            # )
            # _dask_cluster.scheduler.add_plugin(task_monitoring_plugin)

        table_to_dataframe = dask.delayed(self.bigquery.table_to_dataframe)

        for period in self.config.metrics:
            segment_results = []
            time_limits = self._get_timelimits_if_ready(period, current_date)

            if time_limits is None:
                logger.info(
                    "Skipping %s (%s); not ready",
                    self.config.experiment.normandy_slug,
                    period.value,
                )
                continue

            exp = mozanalysis.experiment.Experiment(
                experiment_slug=self.config.experiment.normandy_slug,
                start_date=self.config.experiment.start_date.strftime("%Y-%m-%d"),
                app_id=self._app_id_to_bigquery_dataset(self.config.experiment.app_id),
            )

            analysis_bases = []

            for m in self.config.metrics[period]:
                for analysis_basis in m.metric.analysis_bases:
                    analysis_bases.append(analysis_basis)

            analysis_bases = list(set(analysis_bases))

            if len(analysis_bases) == 0:
                continue

            for analysis_basis in analysis_bases:
                metrics_table = self.calculate_metrics(
                    exp, time_limits, period, analysis_basis, dry_run
                )

                if dry_run:
                    results.append(metrics_table)
                else:
                    metrics_dataframe = table_to_dataframe(metrics_table)

                if dry_run:
                    logger.info(
                        "Not calculating statistics %s (%s); dry run",
                        self.config.experiment.normandy_slug,
                        period.value,
                    )
                    continue

                segment_labels = ["all"] + [s.name for s in self.config.experiment.segments]
                for segment in segment_labels:
                    segment_data = self.subset_to_segment(segment, metrics_dataframe)
                    for m in self.config.metrics[period]:
                        segment_results += self.calculate_statistics(
                            m,
                            segment_data,
                            segment,
                            analysis_basis,
                        ).to_dict()["data"]

                    segment_results += self.counts(segment_data, segment, analysis_basis).to_dict()[
                        "data"
                    ]

            results.append(
                self.save_statistics(
                    period,
                    segment_results,
                    self._table_name(period.value, len(time_limits.analysis_windows)),
                )
            )

        result_futures = client.compute(results)
        client.gather(result_futures)  # block until futures have finished
Example #29
def beta_parallel_disk_detection(dataset,
                            probe,
                            #rxmin=None, # these would allow selecting a sub section 
                            #rxmax=None,
                            #rymin=None,
                            #rymax=None,
                            #qxmin=None,
                            #qxmax=None,
                            #qymin=None,
                            #qymax=None,
                            probe_type="FT",
                            dask_client= None,
                            dask_client_params:dict=None,
                            restart_dask_client=True,
                            close_dask_client=False,
                            return_dask_client=True,
                            *args, **kwargs):
    """
    This is not fully validated currently, so it may not work; please report bugs on the py4DSTEM github page.

    This parallelises the disk detection for all probe positions. It can operate on either in-memory or out-of-memory datasets.

    There is an assumption that, unless specified otherwise, you are parallelising on a single local machine.
    If this is not the case, it is probably best to pass the dask_client into the function, although you can also pass the required arguments via dask_client_params.
    If no dask_client arguments are passed, it will create a dask_client for a local machine.
    
    Note:
        Do not pass "peaks" argument as a kwarg, like you might in "_find_Bragg_disks_single_DP_FK", as the results will be unreliable and may cause the calculation to crash.
    Args:
        dataset (py4dSTEM datacube): 4DSTEM dataset
        probe (ndarray): can be regular probe kernel or Fourier transformed
        probe_type (str): "FT" or None 
        dask_client (distributed.client.Client): dask client
        dask_client_params (dict): parameters to pass to dask client or dask cluster
        restart_dask_client (bool): if True, function will attempt to restart the dask_client.
        close_dask_client (bool): if True, function will attempt to close the dask_client.
        return_dask_client (bool): if True, function will return the dask_client.
        *args,kwargs will be passed to "_find_Bragg_disks_single_DP_FK" e.g. corrPower, sigma, edgeboundary...

    Returns:
        peaks (PointListArray): the Bragg peak positions and the correlation intensities
        dask_client(optional) (distributed.client.Client): dask_client for use later.
    """
    #TODO add asserts about peaks not being passed
    # Dask Client stuff
    #TODO how to guess at default params for client, sqrt no. cores. Something to do with the size of the diffraction pattern
    # write a function which can do this.
    #TODO replace dask part with a with statement for easier clean up e.g.
    # with LocalCluster(params) as cluster, Client(cluster) as client:
    #   ... dask stuff.
    #TODO add assert statements and other checks. Think about reordering operations
    
    if dask_client is None:
        if dask_client_params is not None:

            dask.config.set({'distributed.worker.memory.spill': False,
                'distributed.worker.memory.target': False}) 
            cluster = LocalCluster(**dask_client_params)
            dask_client = Client(cluster, **dask_client_params)
        else:
            # AUTO MAGICALLY SET?
            # LET DASK SET?
            # HAVE A FUNCTION WHICH RUNS ON A SUBSET OF THE DATA TO PICK OPTIMAL VALUE?
            # psutil could be used to count cores. 
            dask.config.set({'distributed.worker.memory.spill': False, # stops spilling to disk
                'distributed.worker.memory.target': False}) # stops spilling to disk and erroring out
            cluster = LocalCluster()
            dask_client = Client(cluster)

    else:
        assert type(dask_client) == distributed.client.Client
        if restart_dask_client:
            try:
                dask_client.restart()
            except Exception as e:
                print('Could not restart dask client. Try manually restarting outside or passing "restart_dask_client=False"') # WARNING STATEMENT
                return e 
        else:
            pass


    # Probe stuff
    assert (probe.shape == dataset.data.shape[2:]), "Probe and Diffraction Pattern Shapes are Mismatched"
    if probe_type != "FT":
    #TODO clean up and pull out redundant parts
    #if probe.dtype != (np.complex128 or np.complex64 or np.complex256):
        #DO FFT SHIFT THING
        probe_kernel_FT = np.conj(np.fft.fft2(probe))
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()
        # delayed_probe_kernel_FT = delayed(probe_kernel_FT)
    else:
        probe_kernel_FT = probe
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()

    # GET DATA 
    #TODO add another elif if it is a dask array then pass
    if type(dataset.data) == np.ndarray:
        dask_data = da.from_array(dataset.data, chunks=(1, 1,dataset.Q_Nx, dataset.Q_Ny))
    elif dataset.stack_pointer is not None:
        dask_data = da.from_array(dataset.stack_pointer, chunks=(1, 1,dataset.Q_Nx, dataset.Q_Ny))
    else: 
        print("Couldn't access the data")
        return None

    # Convert the data to delayed 
    dataset_delayed = dask_data.to_delayed()
    # TODO Trim data e.g. rx,ry,qx,qy
    # I can pass the index values in here I should trim the probe and diffraction pattern first


    # Into the meat of the function 
    
    # create an empty list to which we will append the delayed function calls.
    res = []
    # loop over dataset_delayed and create a delayed call for each block
    for x in np.ndindex(dataset_delayed.shape):
        temp = delayed(_find_Bragg_disks_single_DP_FK_dask_wrapper)(dataset_delayed[x],
                                probe_kernel_FT=dask_probe_delayed[0,0],
                                #probe_kernel_FT=delayed_probe_kernel_FT,
                                *args, **kwargs) #passing through args from earlier or should I use 
                                #corrPower=corrPower,
                                #sigma=sigma_gaussianFilter,
                                #edgeBoundary=edgeBoundary,
                                #minRelativeIntensity=minRelativeIntensity,
                                #minPeakSpacing=minPeakSpacing,        
                                #maxNumPeaks=maxNumPeaks,
                                #subpixel='poly')
        res.append(temp)
    _temp_peaks = dask_client.compute(res, optimize_graph=True) # creates futures and starts computing 

    output = dask_client.gather(_temp_peaks) # gather the future objects 

    coords = [('qx',float),('qy',float),('intensity',float)]
    peaks = PointListArray(coordinates=coords, shape=dataset.data.shape[:-2])

    #temp_peaks[0][0]

    # operating over a list so we need the size (0->count) and re-create the probe positions (0->rx,0->ry),
    for (count,(rx, ry)) in zip([i for i in range(dataset.data[...,0,0].size)],np.ndindex(dataset.data.shape[:-2])):
        #peaks.get_pointlist(rx, ry).add_pointlist(temp_peaks[0][count])
        #peaks.get_pointlist(rx, ry).add_pointlist(output[count][0])
        peaks.get_pointlist(rx, ry).add_pointlist(output[count])

    # Clean up
    dask_client.cancel(_temp_peaks) # removes from the dask workers
    del _temp_peaks # deletes the object 
    if close_dask_client:
        dask_client.close()
        return peaks
    elif close_dask_client == False and return_dask_client == True:
        return peaks, dask_client
    elif close_dask_client and return_dask_client == False:
        return peaks
    else:
        print('Dask Client in unknown state, this may result in unpredictable behaviour later')
        return peaks
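
A hypothetical call sketch for ``beta_parallel_disk_detection``, assuming ``datacube`` and ``probe_kernel`` are an existing py4DSTEM DataCube and probe kernel (neither is constructed here):

# Illustrative only: let the function build its own LocalCluster/Client,
# then close that client once the peaks have been gathered.
peaks, dask_client = beta_parallel_disk_detection(
    datacube,
    probe_kernel,
    probe_type="FT",
    close_dask_client=False,
    return_dask_client=True,
)
dask_client.close()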

Example #30
# (notebook fragment: `inc`, `dec`, and the Client `c` are defined in earlier cells)
def add(x, y):
    time.sleep(7)
    return x + y


x = delayed(inc)(1)
y = delayed(dec)(2)
total = delayed(add)(x, y)

# In[6]:

# notice the difference from total.compute()
# notice that this cell completes immediately
fut = c.compute(total)

# In[7]:

c.gather(fut)

# Critically, each future represents a result held, or being evaluated, by the cluster. Thus we can control caching of intermediate values - when a future is no longer referenced, its value is forgotten. For example, although we can explicitly pass data into the cluster using `scatter()`, we normally want the workers to load as much of their own data as possible to avoid excessive communication overhead.
#
# The [full API](http://distributed.readthedocs.io/en/latest/api.html) of the distributed scheduler gives details of interacting with the cluster, which remember, can be on your local machine or possibly on a massive computational resource.

# The futures API offers a work-submission style that can easily emulate the map/reduce paradigm (see `c.map()`) that may be familiar to many people. The intermediate results, represented by futures, can be passed to new tasks without having to pull them back locally from the cluster, and new work can be assigned to work on the output of previous jobs that haven't even begun yet.
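
# A small illustrative sketch of this futures-style map/reduce (not part of the
# original notebook); it assumes the Client `c` created in earlier cells.
squares = c.map(lambda i: i ** 2, range(8))   # one future per input element
total = c.submit(sum, squares)                # reduce on the cluster; no local pull needed
total.result()                                # 0 + 1 + 4 + ... + 49 == 140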
#
# Generally, any Dask operation that is executed using `.compute()` can be submitted for asynchronous execution using `c.compute()` instead, and this applies to all collections. Here is an example with the calculation previously seen in the Bag chapter. We have replaced the `.compute()` method there with the distributed client version, so, again, we could continue to submit more work (perhaps based on the result of the calculation), or, in the next cell, follow the progress of the computation. A similar progress-bar appears in the monitoring UI page.

# In[8]: