def eval_parameter_grid(run_ids,
                        job_name,
                        eval_function,
                        parameter_grid,
                        n_gpus=1):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='5:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:{n_gpus}',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
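    # one SLURM job (one dask worker) per parameter configuration so all evaluations run concurrently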
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
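    # strip training-only options (n_samples, loss, fixed_masks) from each config;
    # they are restored after gathering so the printed parameters match the training run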
    original_parameters = []
    for params in parameters:
        original_params = {}
        original_params['n_samples'] = params.pop('n_samples', None)
        original_params['loss'] = params.pop('loss', 'mae')
        original_params['fixed_masks'] = params.pop('fixed_masks', False)
        original_parameters.append(original_params)
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=50,
            **params,
        ) for run_id, params in zip(run_ids, parameters)
    ]

    for params, original_params, future in zip(parameters, original_parameters,
                                               futures):
        metrics_names, eval_res = client.gather(future)
        params.update(original_params)
        print('Parameters', params)
        print(metrics_names)
        print(eval_res)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
Example No. 2
def train_eval_dealiasers(contrast='CORPD_FBK', n_epochs=200, n_samples=None,
                          model_name=None, model_size=None, loss='mae'):
    job_name = 'dealiasing_fastmri'
    model_specs = list(get_model_specs(force_res=True, dealiasing=True))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
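    # adaptive scaling: spawn up to one training job per model and release jobs when idle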
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)
    futures = [client.submit(
        # function to execute
        train_dealiaser,
        model_fun=model_fun,
        model_kwargs=kwargs,
        run_id=f'{model_name}_{model_size}',
        n_scales=n_scales,
        contrast=contrast,
        n_epochs=n_epochs,
        n_samples=n_samples,
        loss=loss,
    ) for model_name, model_size, model_fun, kwargs, _, n_scales, _ in model_specs]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_dealiasers(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
        loss=loss,
    )
    return run_ids
def eval_parameter_grid(job_name,
                        eval_function,
                        parameter_grid,
                        run_ids,
                        n_samples_eval=None):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    assert n_parameters_config == len(
        run_ids), 'Not enough run ids provided for grid evaluation'
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='60GB',
        job_name=job_name,
        walltime='3:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
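    # remember each config's training n_samples so it can be re-attached to the
    # reported parameters; -1 marks configs that did not set it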
    n_samples_list = []
    for params in parameters:
        n_samples = params.pop('n_samples', -1)
        n_samples_list.append(n_samples)
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=n_samples_eval,
            **params,
        ) for run_id, params in zip(run_ids, parameters)
    ]

    results = []
    for params, future, n_samples in zip(parameters, futures, n_samples_list):
        metrics_names, eval_res = client.gather(future)
        if n_samples != -1:
            params.update({'n_samples': n_samples})
        results.append((params, eval_res))
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
    return metrics_names, results
Example No. 4
class Cluster:
    def __init__(self):
        print("Start Cluster")
        self.cluster = SLURMCluster(memory='16g',
                                    processes=1,
                                    cores=1,
                                    death_timeout=200,
                                    walltime="168:00:00",
                                    job_extra=['--partition=Sibirien'])
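        # start_workers() is the old dask-jobqueue API; recent releases use scale() instead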
        self.cluster.start_workers(25)
        self.cli = Client(self.cluster.scheduler.address)

    def close(self):
        self.cluster.close()
def train_eval_parameter_grid(job_name,
                              train_function,
                              eval_function,
                              parameter_grid,
                              n_samples_eval=None):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='60GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.scale(n_parameters_config)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_function,
            **params,
        ) for params in parameters
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    return eval_parameter_grid(
        job_name,
        eval_function,
        parameter_grid,
        run_ids,
        n_samples_eval=n_samples_eval,
    )
def train_eval_parameter_grid(job_name, train_function, eval_function,
                              parameter_grid):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.scale(n_parameters_config)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_function,
            **params,
        ) for params in parameters
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    eval_parameter_grid(run_ids, job_name, eval_function, parameter_grid)
def eval_plug_and_play(
    run_ids,
    job_name='eval_pandp',
    contrast='CORPD_FBK',
    n_samples_train=None,
    n_epochs=200,
    af=4,
    n_primal=5,
    train_partition='gpu_p1',
    model_name=None,
    model_size=None,
):
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='2:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
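    # at most one evaluation job per model, released as soon as the work is done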
    eval_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(eval_cluster)

    futures = [
        client.submit(
            # function to execute
            evaluate_xpdnet,
            model_fun=model_fun,
            model_kwargs=kwargs,
            run_id=run_id,
            multicoil=False,
            n_samples=50,
            contrast=contrast,
            af=af,
            n_epochs=n_epochs,
            n_scales=n_scales,
            res=res,
        ) for run_id, (_, _, model_fun, kwargs, _, n_scales,
                       res) in zip(run_ids, model_specs)
    ]

    df_results = pd.DataFrame(
        columns='model_name model_size psnr ssim'.split())

    for (name, model_size, _, _, _, _, _), future in zip(model_specs, futures):
        _, eval_res = client.gather(future)
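        # note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the replacement on newer pandas versions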
        df_results = df_results.append(
            dict(
                model_name=name,
                model_size=model_size,
                psnr=eval_res[0],
                ssim=eval_res[1],
            ),
            ignore_index=True,
        )

    print(df_results)
    outputs_file = f'reconstruction_results_{n_samples_train}.csv'
    if model_name is not None:
        outputs_file = f'reconstruction_results_{n_samples_train}_{model_name}.csv'
    df_results.to_csv(outputs_file)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
    return run_ids
Example No. 8
import logging, time
import xarray as xr
from dask.distributed import Client
from typing import List, Optional, Tuple, Dict, Any
from dask_jobqueue import SLURMCluster
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

variable = "tas"
uri = 'https://dataserver.nccs.nasa.gov/thredds/dodsC/bypass/CREATE-IP/reanalysis/MERRA2/mon/atmos/tas.ncml'

cluster = SLURMCluster(queue="myNodes")
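# adaptively keep between 1 and 4 workers, re-evaluating the target every 2 seconds;
# the large wait_count delays scale-down decisions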
cluster.adapt(minimum=1, maximum=4, interval="2s", wait_count=500)
print("CLUSTER JOB SCRIPT: " + cluster.job_script())
client = Client(cluster)

t0 = time.time()
dset: xr.Dataset = xr.open_dataset(uri)
da: xr.DataArray = dset['tas']
da2: xr.DataArray = da.groupby('time.month').mean('time')
da_monthly = da2.load()
print(da_monthly)
print(" Completed computation in " + str(time.time() - t0) + " seconds")
client.close()
cluster.close()
def full_pipeline_dask(job_name, train_function, eval_function, infer_function,
                       **kwargs):
    # original training
    if os.environ.get('FASTMRI_DEBUG'):
        n_epochs_train = 1
        n_epochs_fine_tune = 1
        n_eval_samples = 1
        n_inference_samples = 1
    else:
        n_epochs_train = 250
        n_epochs_fine_tune = 50
        n_eval_samples = 50
        n_inference_samples = None
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='100:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
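    # one training job per acceleration factor (4 and 8)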
    train_cluster.scale(2)
    client = Client(train_cluster)
    acceleration_factors = [4, 8]
    futures = [
        client.submit(
            # function to execute
            train_function,
            af=af,
            n_epochs=n_epochs_train,
            **kwargs,
            # this function has potential side effects
            pure=True,
        ) for af in acceleration_factors
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # fine tuning
    fine_tuning_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
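    # 2 acceleration factors x 2 contrasts = 4 fine-tuning jobs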
    fine_tuning_cluster.scale(4)
    client = Client(fine_tuning_cluster)
    contrasts = ['CORPDFS_FBK', 'CORPD_FBK']
    futures = []
    for af, run_id in zip(acceleration_factors, run_ids):
        for contrast in contrasts:
            futures += [
                client.submit(
                    # function to execute
                    train_function,
                    af=af,
                    contrast=contrast,
                    original_run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    **kwargs,
                    # this function has potential side effects
                    pure=True,
                )
            ]
    fine_tuned_run_ids = client.gather(futures)
    client.close()
    fine_tuning_cluster.close()
    # inference and eval
    inference_eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
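    # 4 fine-tuned runs, each with one inference and one evaluation task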
    inference_eval_cluster.scale(8)
    client = Client(inference_eval_cluster)
    i_run_id = 0
    inference_futures = []
    eval_futures = []
    kwargs.pop('loss')
    for af in acceleration_factors:
        for contrast in contrasts:
            run_id = fine_tuned_run_ids[i_run_id]
            inference_futures += [
                client.submit(
                    # function to execute
                    infer_function,
                    contrast=contrast,
                    af=af,
                    run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    n_samples=n_inference_samples,
                    exp_id=job_name,
                    **kwargs,
                    # this function has potential side effects
                    pure=True,
                )
            ]
            eval_futures += [
                client.submit(
                    # function to execute
                    eval_function,
                    contrast=contrast,
                    af=af,
                    run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    n_samples=n_eval_samples,
                    **kwargs,
                    # this function has potential side effects
                    pure=True,
                )
            ]
            i_run_id += 1
    client.gather(inference_futures)
    # eval printing
    i_run_id = 0
    for af in acceleration_factors:
        for contrast in contrasts:
            metrics_names, eval_res = client.gather(eval_futures[i_run_id])
            print('AF', af)
            print('Contrast', contrast)
            print(metrics_names)
            print(eval_res)
            i_run_id += 1
    print('Shutting down dask workers')
    client.close()
    inference_eval_cluster.close()
Example No. 10
class dask_controller:  #adapted from Charles' code
    def __init__(self, n_workers=6, local=True, queue="short", death_timeout=3.,
                 walltime='01:30:00', cores=1, processes=1, memory='6GB',
                 working_directory="./", job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.death_timeout = death_timeout
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.working_directory = working_directory
        self.job_extra = job_extra

        writedir(working_directory, overwrite=False)

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,
                                            death_timeout=self.death_timeout,
                                            walltime=self.walltime,
                                            processes=self.processes,
                                            memory=self.memory,
                                            cores=self.cores,
                                            local_directory=self.working_directory,
                                            log_directory=self.working_directory,
                                            job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        self.daskclient.restart()
        if not self.local:
            self.daskcluster.stop_all_jobs()
            self.daskcluster.close()
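        # clean up worker/SLURM scratch files left in the working directory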
        for item in os.listdir(self.working_directory):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = "./" + item
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        complete = len(
            [item for item in self.futures if item.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def displaydashboard(self):
        link = self.daskcluster.dashboard_link
        display(HTML('<a href="' + link + '">Dashboard</a>'))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [
            fov for fov, future in self.futures.items()
            if future.status != 'finished'
        ]
        out = self.daskclient.restart()
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [
            fov for fov, future in self.futures.items()
            if future.status == 'pending'
        ]
        out = self.daskclient.restart()
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
Example No. 11
def full_pipeline_dask():
    job_name = 'grappa'
    acceleration_factors = [4, 8]
    contrasts = ['CORPDFS_FBK', 'CORPD_FBK']
    # inference and eval
    inference_eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=10,
        memory='20GB',
        job_name=job_name,
        walltime='1:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:0',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
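    # 2 acceleration factors x 2 contrasts, each with an inference and an evaluation task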
    inference_eval_cluster.scale(8)
    client = Client(inference_eval_cluster)
    inference_futures = []
    eval_futures = []
    for af in acceleration_factors:
        for contrast in contrasts:
            inference_futures += [
                client.submit(
                    # function to execute
                    grappa_inference,
                    contrast=contrast,
                    af=af,
                    exp_id=job_name,
                    # this function has potential side effects
                    pure=True,
                )
            ]
            eval_futures += [
                client.submit(
                    # function to execute
                    eval_grappa,
                    contrast=contrast,
                    af=af,
                    n_samples=50,
                    # this function has potential side effects
                    pure=True,
                )
            ]
    client.gather(inference_futures)
    # eval printing
    i = 0
    for af in acceleration_factors:
        for contrast in contrasts:
            m = client.gather(eval_futures[i])
            print('AF', af)
            print('Contrast', contrast)
            print(m)
            i += 1
    print('Shutting down dask workers')
    client.close()
    inference_eval_cluster.close()
def train_eval_plug_and_play(
    contrast='CORPD_FBK',
    n_epochs=200,
    n_samples=None,
    af=4,
    n_primal=5,
    loss='compound_mssim',
    train_partition='gpu_p1',
    model_name=None,
    model_size=None,
):
    job_name = 'plug_and_play'
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
            f'--partition {train_partition}',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_xpdnet,
            model_fun=model_fun,
            model_kwargs=kwargs,
            model_size=model_size,
            multicoil=False,
            n_scales=n_scales,
            res=res,
            n_primal=n_primal,
            contrast=contrast,
            n_epochs=n_epochs,
            n_samples=n_samples,
            af=af,
            loss=loss,
        ) for _, model_size, model_fun, kwargs, _, n_scales, res in model_specs
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_plug_and_play(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        af=af,
        n_primal=n_primal,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
    )
    return run_ids
Example No. 13
def run_benchmarks(args: Args):
    # Results are stored as they are returned
    all_results = {}

    # Try running the benchmarks
    try:
        # Get benchmark resources dir
        resources_dir = (
            Path(__file__).parent.parent / "aicsimageio" / "tests" / "resources"
        )

        # Store machine config
        _ = {
            "platform": platform.system(),
            "platform_version": platform.version(),
            "architecture": platform.machine(),
            "cpu_total_count": psutil.cpu_count(),
            "cpu_current_utilization": psutil.cpu_percent(),
            "memory_total_gb": psutil.virtual_memory().total / 10e8,
            "memory_available_gb": psutil.virtual_memory().available / 10e8,
        }

        # Store python config
        pyversion = sys.version_info
        _ = {
            "python_version":
            f"{pyversion.major}.{pyversion.minor}.{pyversion.micro}",
            "aicsimageio": aicsimageio.__version__,
            "czifile": czifile.__version__,
            "imageio": imageio.__version__,
            "tifffile": tifffile.__version__,
        }

        # Run tests
        #######################################################################

        log.info(f"Running tests: no cluster...")
        log.info(f"=" * 80)

        all_results["no-cluster"] = _run_benchmark_suite(
            resources_dir=resources_dir)

        #######################################################################

        for cluster_config in CLUSTER_CONFIGS:
            total_cores = cluster_config["per_worker_cores"] * cluster_config[
                "workers"]
            log.info(f"Running tests: {cluster_config['name']} "
                     f"(Total cores: {total_cores}) ...")
            log.info(f"=" * 80)

            # Create or get log dir
            # Do not include ms
            log_dir_name = datetime.now().isoformat().split(".")[0]
            log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
            # Log dir settings
            log_dir.mkdir(parents=True, exist_ok=True)

            # Calc per_worker_memory
            per_worker_memory = cluster_config["per_worker_cores"] * 2
            per_worker_memory = f"{per_worker_memory}GB"

            # Create cluster
            cluster = SLURMCluster(
                cores=cluster_config["per_worker_cores"],
                memory=per_worker_memory,
                queue="aics_cpu_general",
                walltime="10:00:00",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )

            # Scale cluster
            cluster.scale(cluster_config["workers"])

            # Create client connection
            client = Client(cluster)

            # Wait for a minute for the cluster to fully spin up
            time.sleep(60)

            # Run benchmark
            all_results[cluster_config["name"]] = _run_benchmark_suite(
                resources_dir=resources_dir)

            client.shutdown()
            cluster.close()

            # Wait for a minute for the cluster to fully shutdown
            time.sleep(60)

        #######################################################################

        log.info(f"Completed all tests")
        log.info(f"=" * 80)

        # Ensure save dir exists and save results
        args.save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(args.save_path, "w") as write_out:
            json.dump(all_results, write_out)

        # Construct and push package
        if args.upload:
            p = Package()
            p.set("results.json", args.save_path)
            p.push(
                "aicsimageio/benchmarks",
                "s3://aics-modeling-packages-test-resources",
                message=f"aicsimageio version: {aicsimageio.__version__}",
            )

    # Catch any exception
    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
def main():
    # Distributed host for

    p = argparse.ArgumentParser(prog="process", description="Process the FOV pipeline")

    p.add_argument(
        "--min_jobs", type=int, default=300, help="Minimum number of jobs to use",
    )

    p.add_argument(
        "--max_jobs", type=int, default=300, help="Maximum number of jobs to use",
    )
    p.add_argument(
        "--walltime", type=int, default=5, help="Walltime in hours",
    )
    p.add_argument(
        "--up_time", type=int, default=10, help="up time for the scheduler in hours",
    )

    args = p.parse_args()

    cluster = SLURMCluster(
        cores=2,
        memory="16GB",
        walltime="{}:00:00".format(args.walltime),
        queue="aics_cpu_general",
    )

    cluster.adapt(minimum_jobs=args.min_jobs, maximum_jobs=args.max_jobs)
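    # with the default min_jobs == max_jobs (300) this effectively pins the pool to a fixed size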
    client = dask.distributed.Client(cluster)  # noqa

    connection_info = {}
    connection_info["HOSTNAME"] = socket.gethostname()
    connection_info["PORT"] = cluster.scheduler_info["address"].split(":")[-1]
    connection_info["DASHBOARD_PORT"] = cluster.scheduler_info["services"]["dashboard"]

    connection_str = (
        "ssh -A -J slurm-master -L {PORT}:{HOSTNAME}:{PORT} -L "
        "{DASHBOARD_PORT}:{HOSTNAME}:{DASHBOARD_PORT} {HOSTNAME}".format(
            **connection_info
        )
    )

    log.info(
        (
            "In a new terimal the machine that you run the pipeline on, copy and paste the following string to forward "
            "ports to this server:"
        )
    )
    log.info(connection_str)
    log.info(" ")
    log.info("Then use the following command to kick off your FPP jobs:")
    log.info("fpp_process --distributed 1 --port {PORT}".format(**connection_info))
    log.info(" ")
    log.info("You can see the dashboard on:")
    log.info("localhost:{PORT}".format(**connection_info))
    log.info(" ")
    log.info("Command + C will teardown the server.")

    try:
        time.sleep(args.up_time * 60 * 60)
    except KeyboardInterrupt:
        log.info("Tearing down scheduler.")
        cluster.close()
Example No. 15
def esi_cluster_setup(partition="8GBS", n_jobs=2, mem_per_job=None,
                      timeout=180, interactive=True, start_client=True,
                      **kwargs):
    """
    Start a distributed Dask cluster of parallel processing workers using SLURM 
    (or local multi-processing)
    
    Parameters
    ----------
    partition : str
        Name of SLURM partition/queue to use
    n_jobs : int
        Number of jobs to spawn
    mem_per_job : None or str
        Memory booking for each job. Can be specified either in megabytes 
        (e.g., ``mem_per_job = 1500MB``) or gigabytes (e.g., ``mem_per_job = "2GB"``). 
        If `mem_per_job` is `None`, a sane default value is inferred
        from the chosen queue, e.g., for ``partition = "8GBS"`` `mem_per_job` is 
        automatically set to the allowed maximum of `'8GB'`. However, even in
        queues with guaranteed memory bookings, it is possible to allocate less
        memory than the allowed maximum per job to spawn numerous low-memory 
        jobs. See Examples for details. 
    timeout : int
        Number of seconds to wait for requested jobs to start up. 
    interactive : bool
        If `True`, user input is required in case not all jobs could 
        be started in the provided waiting period (determined by `timeout`). 
        If `interactive` is `False` and the jobs could not be started
        within `timeout` seconds, a `TimeoutError` is raised. 
    start_client : bool
        If `True`, a distributed computing client is launched and attached to
        the workers. If `start_client` is `False`, only a distributed 
        computing cluster is started to which compute-clients can connect. 
    **kwargs : dict
        Additional keyword arguments can be used to control job-submission details. 
        
    Returns
    -------
    proc : object
        A distributed computing client (if ``start_client = True``) or 
        a distributed computing cluster (otherwise). 

    Examples
    --------
    The following command launches 10 SLURM jobs with 2 gigabytes memory each 
    in the `8GBS` partition
    
    >>> spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB") 
    
    If you want to access properties of the created distributed computing client, 
    assign an explicit return quantity, i.e., 
    
    >>> client = spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB") 
    
    The underlying distributed computing cluster can be accessed using
    
    >>> client.cluster
    
    Notes
    -----
    Syncopy's parallel computing engine relies on the concurrent processing library
    `Dask <https://docs.dask.org/en/latest/>`_. Thus, the distributed computing
    clients used by Syncopy are in fact instances of :class:`dask.distributed.Client`. 
    This function specifically acts as a wrapper for :class:`dask_jobqueue.SLURMCluster`. 
    Users familiar with Dask in general and its distributed scheduler and cluster 
    objects in particular, may leverage Dask's entire API to fine-tune parallel 
    processing jobs to their liking (if wanted). 
    
    See also
    --------
    cluster_cleanup : remove dangling parallel processing job-clusters
    """
    
    # For later reference: dynamically fetch name of current function
    funcName = "Syncopy <{}>".format(inspect.currentframe().f_code.co_name)
    
    # Be optimistic: prepare success message
    successMsg = "{name:s} Cluster dashboard accessible at {dash:s}"

    # Retrieve all partitions currently available in SLURM
    out, err = subprocess.Popen("sinfo -h -o %P",
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                text=True, shell=True).communicate()
    if len(err) > 0:
        
        # SLURM is not installed, either allocate `LocalCluster` or just leave
        if "sinfo: not found" in err:
            if interactive:
                msg = "{name:s} SLURM does not seem to be installed on this machine " +\
                    "({host:s}). Do you want to start a local multi-processing " +\
                    "computing client instead? "
                startLocal = user_yesno(msg.format(name=funcName, host=socket.gethostname()), 
                                        default="no")
            else:
                startLocal = True
            if startLocal:
                client = Client()
                successMsg = "{name:s} Local parallel computing client ready. \n" + successMsg
                print(successMsg.format(name=funcName, dash=client.cluster.dashboard_link))
                if start_client:
                    return client
                return client.cluster
            return 

        # SLURM is installed, but something's wrong        
        msg = "SLURM queuing system from node {node:s}. " +\
              "Original error message below:\n{error:s}"
        raise SPYIOError(msg.format(node=socket.gethostname(), error=err))
    options = out.split()

    # Make sure we're in a valid partition (exclude IT partitions from output message)
    if partition not in options:
        valid = list(set(options).difference(["DEV", "PPC"]))
        raise SPYValueError(legal="'" + "or '".join(opt + "' " for opt in valid),
                            varname="partition", actual=partition)

    # Parse job count
    try:
        scalar_parser(n_jobs, varname="n_jobs", ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    # Get requested memory per job
    if mem_per_job is not None:
        if not isinstance(mem_per_job, str):
            raise SPYTypeError(mem_per_job, varname="mem_per_job", expected="string")
        if not any(szstr in mem_per_job for szstr in ["MB", "GB"]):
            lgl = "string representation of requested memory (e.g., '8GB', '12000MB')"
            raise SPYValueError(legal=lgl, varname="mem_per_job", actual=mem_per_job)

    # Query memory limit of chosen partition and ensure that `mem_per_job` is
    # set for partitions w/o limit
    idx = partition.find("GB")
    if idx > 0:
        mem_lim = int(partition[:idx]) * 1000
    else:
        if partition == "PREPO":
            mem_lim = 16000
        else:
            if mem_per_job is None:
                lgl = "explicit memory amount as required by partition '{}'"
                raise SPYValueError(legal=lgl.format(partition),
                                    varname="mem_per_job", actual=mem_per_job)
        mem_lim = np.inf

    # Consolidate requested memory with chosen partition (or assign default memory)
    if mem_per_job is None:
        mem_per_job = str(mem_lim) + "MB"
    else:
        if "MB" in mem_per_job:
            mem_req = int(mem_per_job[:mem_per_job.find("MB")])
        else:
            mem_req = int(round(float(mem_per_job[:mem_per_job.find("GB")]) * 1000))
        if mem_req > mem_lim:
            msg = "`mem_per_job` exceeds limit of {lim:d}GB for partition {par:s}. " +\
                "Capping memory at partition limit. "
            SPYWarning(msg.format(lim=mem_lim, par=partition))
            mem_per_job = str(int(mem_lim)) + "GB"

    # Parse requested timeout period
    try:
        scalar_parser(timeout, varname="timeout", ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    # Determine if cluster allocation is happening interactively
    if not isinstance(interactive, bool):
        raise SPYTypeError(interactive, varname="interactive", expected="bool")

    # Determine if a dask client was requested
    if not isinstance(start_client, bool):
        raise SPYTypeError(start_client, varname="start_client", expected="bool")

    # Set/get "hidden" kwargs
    workers_per_job = kwargs.get("workers_per_job", 1)
    try:
        scalar_parser(workers_per_job, varname="workers_per_job",
                      ntype="int_like", lims=[1, 8])
    except Exception as exc:
        raise exc

    n_cores = kwargs.get("n_cores", 1)
    try:
        scalar_parser(n_cores, varname="n_cores",
                      ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    slurm_wdir = kwargs.get("slurmWorkingDirectory", None)
    if slurm_wdir is None:
        usr = getpass.getuser()
        slurm_wdir = "/mnt/hpx/slurm/{usr:s}/{usr:s}_{date:s}"
        slurm_wdir = slurm_wdir.format(usr=usr,
                                       date=datetime.now().strftime('%Y%m%d-%H%M%S'))
        os.makedirs(slurm_wdir, exist_ok=True)
    else:
        try:
            io_parser(slurm_wdir, varname="slurmWorkingDirectory", isfile=False)
        except Exception as exc:
            raise exc
        
    # Hotfix for upgraded cluster-nodes: point to correct Python executable if working from /home
    pyExec = sys.executable
    if sys.executable.startswith("/home"):
        pyExec = "/mnt/gs" + sys.executable
        
    # Create `SLURMCluster` object using provided parameters
    out_files = os.path.join(slurm_wdir, "slurm-%j.out")
    cluster = SLURMCluster(cores=n_cores,
                           memory=mem_per_job,
                           processes=workers_per_job,
                           local_directory=slurm_wdir,
                           queue=partition,
                           name="spyswarm",
                           python=pyExec,
                           header_skip=["-t", "--mem"],
                           job_extra=["--output={}".format(out_files)])
                           # interface="asdf", # interface is set via `psutil.net_if_addrs()`
                           # job_extra=["--hint=nomultithread",
                           #            "--threads-per-core=1"]
                           
    # Compute total no. of workers and up-scale cluster accordingly
    total_workers = n_jobs * workers_per_job
    cluster.scale(total_workers)

    # Fire up waiting routine to avoid premature cluster setups
    if _cluster_waiter(cluster, funcName, total_workers, timeout, interactive):
        return
    
    # Kill a zombie cluster in non-interactive mode
    if not interactive and _count_running_workers(cluster) == 0:
        cluster.close()
        err = "SLURM jobs could not be started within given time-out " +\
              "interval of {0:d} seconds"
        raise TimeoutError(err.format(timeout))
    
    # Highlight how to connect to dask performance monitor
    print(successMsg.format(name=funcName, dash=cluster.dashboard_link))

    # If client was requested, return that instead of the created cluster
    if start_client:
        return Client(cluster)
    return cluster
def train_eval_parameter_grid(job_name, train_function, eval_function,
                              parameter_grid):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.scale(n_parameters_config)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_function,
            **params,
        ) for params in parameters
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
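    # n_samples is a training-only parameter; evaluation always uses 50 samples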
    for params in parameters:
        params.pop('n_samples')
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=50,
            **params,
        ) for run_id, params in zip(run_ids, parameters)
    ]

    for params, future in zip(parameters, futures):
        metrics_names, eval_res = client.gather(future)
        print('Parameters', params)
        print(metrics_names)
        print(eval_res)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
Example No. 17
    def run(
        self,
        distributed: bool = False,
        clean: bool = False,
        debug: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.

        Parameters
        ----------
        distributed: bool
            Create a SLURMCluster to use for job distribution.
            Default: False (do not create a cluster)
        clean: bool
            Should the local staging directory be cleaned prior to this run.
            Default: False (Do not clean)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc.
            Default: False (Do not debug)

        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/

        Basic prefect example:
        https://docs.prefect.io/core/
        """
        # Initialize steps
        raw = steps.MappedRaw()
        invert = steps.MappedInvert()
        cumsum = steps.MappedSum()
        plot = steps.Plot()
        fancyplot = steps.Fancyplot()

        # Choose executor
        if distributed:
            # Log dir settings
            log_dir_name = datetime.now().isoformat().split(".")[0]  # Do not include ms
            log_dir = Path(f".logs/{log_dir_name}/")
            log_dir.mkdir(parents=True)

            # Spawn cluster
            cluster = SLURMCluster(
                cores=2,
                memory="32GB",
                walltime="10:00:00",
                queue="aics_cpu_general",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )

            # Set adaptive scaling
            cluster.adapt(minimum_jobs=1, maximum_jobs=40)

        else:
            # Stop conflicts between Dask and OpenBLAS
            # Info here:
            # https://stackoverflow.com/questions/45086246/too-many-memory-regions-error-with-dask
            os.environ["OMP_NUM_THREADS"] = "1"

            # Spawn local cluster
            cluster = LocalCluster()

        # Log bokeh info
        if cluster.dashboard_link:
            log.info(f"Dask UI running at: {cluster.dashboard_link}")

        # Create a Prefect Dask executor attached to the cluster's scheduler
        exe = DaskExecutor(cluster.scheduler_address)

        # Configure your flow
        with Flow("example_step_workflow") as flow:
            # If your step utilizes a secondary flow with dask pass the executor address
            # If you want to clean the local staging directories pass clean
            # If you want to utilize some debugging functionality pass debug
            # If you don't utilize any of these, just pass the parameters you need.
            matrices = raw(
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
                **kwargs,  # Allows us to pass `--n {some integer}` or other params
            )
            inversions = invert(
                matrices,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )
            vectors = cumsum(
                inversions,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )
            plot(
                vectors,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )
            fancyplot(
                vectors,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )

        # Run flow and get ending state
        state = flow.run(executor=exe)

        # Get plot location
        log.info(f"Plot stored to: {plot.get_result(state, flow)}")

        # Close cluster
        if distributed:
            cluster.close()