Example #1
def run_HPC():
        
    #################
    # Setup dask cluster
    #################
    
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]
    
    #job args
    extra_args=[
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]
    
    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3, 
        memory='11GB', 
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=150)
    
    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)
    
    dask_client = Client(cluster)
    
    #Start dask
    dask_client.run_on_scheduler(start_tunnel)  
    run(config, debug=False)
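start_tunnel is not defined in this excerpt; it is only handed to run_on_scheduler, which executes a function on the scheduler node. A hypothetical sketch of what such a helper might look like (the port-forwarding hint is an assumption, not taken from the original project):

def start_tunnel():
    # Hypothetical helper: report where the scheduler is running so the user
    # can forward the dashboard port (8787 by default) from a login node.
    import socket
    host = socket.gethostname()
    print("Scheduler running on {}".format(host))
    print("Forward the dashboard with: ssh -N -L 8787:{}:8787 <login-node>".format(host))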
Example #2
def slurm_cluster(n_workers, cores_per_worker, mem_per_worker, walltime,
                  dask_folder):
    """helper function to start a Dask Slurm-based cluster

    :param n_workers: maximum number of workers to use
    :param cores_per_worker: number of cores per worker
    :param mem_per_worker: maximum amount of RAM per worker
    :param walltime: maximum walltime for each worker job
    :param dask_folder: folder for the workers' temporary data
    """
    dask.config.set({
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    })
    cluster = SLURMCluster(
        cores=cores_per_worker,
        processes=1,
        memory=mem_per_worker,
        walltime=walltime,
        log_directory=dask_folder / "logs",  # folder for SLURM logs for each worker
        local_directory=dask_folder,  # folder for workers data
    )
    cluster.adapt(minimum=1, maximum=n_workers)

    client = Client(cluster)
    return client
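A possible way to call this helper; the argument values and the scratch path below are illustrative, not taken from the original project:

from pathlib import Path

client = slurm_cluster(
    n_workers=10,
    cores_per_worker=4,
    mem_per_worker="8GB",
    walltime="02:00:00",
    dask_folder=Path("/scratch/username/dask"),  # hypothetical scratch location
)
futures = client.map(lambda x: x ** 2, range(100))  # any embarrassingly parallel work
print(sum(client.gather(futures)))
client.close()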
Example #3
def start_dask_cluster(number_of_workers, mem_size="10GB"):

    #################
    # Setup dask cluster
    #################

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory=mem_size,
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/dask/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
Example #4
def dask_slurm_cluster(queue=None,
                       cores=None,
                       memory=None,
                       minimum_workers=None,
                       maximum_workers=None,
                       address=None,
                       port=None,
                       **kwargs):
    __doc__ = _doc_dask_slurm_cluster  # noqa

    queue = queue or DEFAULT_QUEUE
    cores = cores or DEFAULT_NUM_CORES
    memory = memory or DEFAULT_MEMORY
    minimum_workers = minimum_workers or DEFAULT_MINIMUM_WORKERS
    maximum_workers = maximum_workers or DEFAULT_MAXIMUM_WORKERS
    address = address or DEFAULT_ADDRESS
    port = port or DEFAULT_PORT

    cluster = SLURMCluster(queue=queue,
                           cores=cores,
                           memory=memory,
                           host=f'tcp://{address}:{port}',
                           **kwargs)
    cluster.adapt(minimum=minimum_workers, maximum=maximum_workers)
    return cluster
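The DEFAULT_* values and _doc_dask_slurm_cluster are assumed to be defined elsewhere in the same module; a hypothetical sketch of such module-level defaults (values are illustrative only):

# Hypothetical module-level defaults; the real project defines its own values.
DEFAULT_QUEUE = "compute"
DEFAULT_NUM_CORES = 4
DEFAULT_MEMORY = "16GB"
DEFAULT_MINIMUM_WORKERS = 1
DEFAULT_MAXIMUM_WORKERS = 20
DEFAULT_ADDRESS = "127.0.0.1"
DEFAULT_PORT = 8786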
Example #5
def train_eval_dealiasers(contrast='CORPD_FBK', n_epochs=200, n_samples=None, model_name=None, model_size=None, loss='mae'):
    job_name = 'dealiasing_fastmri'
    model_specs = list(get_model_specs(force_res=True, dealiasing=True))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)
    futures = [client.submit(
        # function to execute
        train_dealiaser,
        model_fun=model_fun,
        model_kwargs=kwargs,
        run_id=f'{model_name}_{model_size}',
        n_scales=n_scales,
        contrast=contrast,
        n_epochs=n_epochs,
        n_samples=n_samples,
        loss=loss,
    ) for model_name, model_size, model_fun, kwargs, _, n_scales, _ in model_specs]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_dealiasers(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
        loss=loss,
    )
    return run_ids
Example #6
def _slurmclient(memory: int,
                 partition="epp,taskfarm",
                 account="epp") -> Client:
    # For slurm usage instructions see:
    # https://wiki.csc.warwick.ac.uk/twiki/bin/view/Desktop2018/CowUserGuide
    cluster = SLURMCluster(queue=partition,
                           memory=memory,
                           project=account,
                           cores=1,
                           walltime="24:00:00")
    cluster.adapt(minimum_jobs=1, maximum_jobs=200)
    return Client(address=cluster)
Example #7
def dask_slurm_cluster(queue=None,
                       cores=None,
                       memory=None,
                       minimum_workers=None,
                       maximum_workers=None):
    queue = queue or DEFAULT_QUEUE
    cores = cores or DEFAULT_NUM_CORES
    memory = memory or DEFAULT_MEMORY
    minimum_workers = minimum_workers or DEFAULT_MINIMUM_WORKERS
    maximum_workers = maximum_workers or DEFAULT_MAXIMUM_WORKERS

    cluster = SLURMCluster(queue=queue, cores=cores, memory=memory)
    cluster.adapt(minimum=minimum_workers, maximum=maximum_workers)
    return cluster
Example #8
def run_HPC(data_paths):

    #################
    # Setup dask cluster
    #################

    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client, wait

    DeepForest_config = config.load_config()
    num_workers = DeepForest_config["num_hipergator_workers"]

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory='13GB',
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    for site in data_paths:
        futures = dask_client.map(Generate.run,
                                  data_paths[site],
                                  site=site,
                                  DeepForest_config=DeepForest_config)
        wait(futures)
        print("{} complete".format(site))

    print("All sites complete")
Example #9
def start_slurm_scheduler(account, cores, walltime, memory, processes, interface, local_dir,
                          scheduler_port, dash_port,
                          num_workers, adapt_min, adapt_max):

    # choose either adaptive mode or a fixed number of workers (you
    # can always connect and scale manually without adapt mode);
    # adapt mode is the default since it is the most no-nonsense
    # DWIM approach
    adapt_mode = True
    if num_workers > -1:
        adapt_mode = False

    local_cluster_kwargs = {'scheduler_port' : scheduler_port,
                            'dashboard_address' : ':{}'.format(dash_port)}

    cluster = SLURMCluster(project=account,
                           cores=cores,
                           walltime=walltime,
                           memory=memory,
                           processes=processes,
                           interface=interface,
                           **local_cluster_kwargs)
    with cluster:

        click.echo("Scheduler address: {}".format(cluster.scheduler_address))
        click.echo("Dashboard port: {}".format(cluster.dashboard_link))

        if adapt_mode:
            cluster.adapt(minimum=adapt_min, maximum=adapt_max)
        else:
            cluster.scale(num_workers)



        # loop forever to block
        while True:
            # sleep so we avoid evaluating the loop too frequently
            time.sleep(2)
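As the comments above note, this keeps a standing scheduler alive so that other processes can attach to it by address. A minimal sketch of such a consumer, using a made-up scheduler address:

from dask.distributed import Client

# Attach to the standing scheduler started by start_slurm_scheduler
# (the address below is hypothetical; use the one echoed by the command).
client = Client("tcp://10.0.0.5:8786")
futures = client.map(pow, range(8), range(8))
print(client.gather(futures))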
Example #10
def train_eval_plug_and_play(
    contrast='CORPD_FBK',
    n_epochs=200,
    n_samples=None,
    af=4,
    n_primal=5,
    loss='compound_mssim',
    train_partition='gpu_p1',
    model_name=None,
    model_size=None,
):
    job_name = 'plug_and_play'
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
            f'--partition {train_partition}',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_xpdnet,
            model_fun=model_fun,
            model_kwargs=kwargs,
            model_size=model_size,
            multicoil=False,
            n_scales=n_scales,
            res=res,
            n_primal=n_primal,
            contrast=contrast,
            n_epochs=n_epochs,
            n_samples=n_samples,
            af=af,
            loss=loss,
        ) for _, model_size, model_fun, kwargs, _, n_scales, res in model_specs
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_plug_and_play(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        af=af,
        n_primal=n_primal,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
    )
    return run_ids
Example #11
    #treefile = '../validation_data/16s/16s_salaminWstruct_aln.fasta.treefile'
    #alnfile = '../validation_data/16s/16s_salaminWstruct_aln.fasta'

    #treefile = '../validation_data/dengue/dengue_all.aln.fasta.treefile'
    #alnfile = '../validation_data/dengue/dengue_all.aln.fasta'


    #create distributed cluster
    from dask_jobqueue import SLURMCluster

    cluster = SLURMCluster(
        cores=10,
        memory="20 GB"
    )

    cluster.adapt(maximum=NCORE)
    print(cluster.dashboard_link)
    client = Client(cluster)
    print('cluster deploy sleep')
    #wait for cluster deploy
    time.sleep(20)
    print('done')


    #use blast based annotation to assign codons to column ranges
    allowed_symbols = [b'A', b'C', b'G', b'T']
    allowed_transitions = [c1 + c2 for c1 in allowed_symbols for c2 in allowed_symbols if c1 != c2]
    print('allowed transitions', allowed_transitions)


    transition_dict = {c: i for i, c in enumerate(allowed_transitions)}
Example #12
def eval_plug_and_play(
    run_ids,
    job_name='eval_pandp',
    contrast='CORPD_FBK',
    n_samples_train=None,
    n_epochs=200,
    af=4,
    n_primal=5,
    train_partition='gpu_p1',
    model_name=None,
    model_size=None,
):
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='2:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(eval_cluster)

    futures = [
        client.submit(
            # function to execute
            evaluate_xpdnet,
            model_fun=model_fun,
            model_kwargs=kwargs,
            run_id=run_id,
            multicoil=False,
            n_samples=50,
            contrast=contrast,
            af=af,
            n_epochs=n_epochs,
            n_scales=n_scales,
            res=res,
        ) for run_id, (_, _, model_fun, kwargs, _, n_scales,
                       res) in zip(run_ids, model_specs)
    ]

    df_results = pd.DataFrame(
        columns='model_name model_size psnr ssim'.split())

    for (name, model_size, _, _, _, _, _), future in zip(model_specs, futures):
        _, eval_res = client.gather(future)
        df_results = df_results.append(
            dict(
                model_name=name,
                model_size=model_size,
                psnr=eval_res[0],
                ssim=eval_res[1],
            ),
            ignore_index=True,
        )

    print(df_results)
    outputs_file = f'reconstruction_results_{n_samples_train}.csv'
    if model_name is not None:
        outputs_file = f'reconstruction_results_{n_samples_train}_{model_name}.csv'
    df_results.to_csv(outputs_file)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
    return run_ids
Example #13
def main():
    # Distributed scheduler host for the FOV pipeline

    p = argparse.ArgumentParser(prog="process", description="Process the FOV pipeline")

    p.add_argument(
        "--min_jobs", type=int, default=300, help="Minimum number of jobs to use",
    )

    p.add_argument(
        "--max_jobs", type=int, default=300, help="Maximum number of jobs to use",
    )
    p.add_argument(
        "--walltime", type=int, default=5, help="Walltime in hours",
    )
    p.add_argument(
        "--up_time", type=int, default=10, help="up time for the scheduler in hours",
    )

    args = p.parse_args()

    cluster = SLURMCluster(
        cores=2,
        memory="16GB",
        walltime="{}:00:00".format(args.walltime),
        queue="aics_cpu_general",
    )

    cluster.adapt(minimum_jobs=args.min_jobs, maximum_jobs=args.max_jobs)
    client = dask.distributed.Client(cluster)  # noqa

    connection_info = {}
    connection_info["HOSTNAME"] = socket.gethostname()
    connection_info["PORT"] = cluster.scheduler_info["address"].split(":")[-1]
    connection_info["DASHBOARD_PORT"] = cluster.scheduler_info["services"]["dashboard"]

    connection_str = (
        "ssh -A -J slurm-master -L {PORT}:{HOSTNAME}:{PORT} -L "
        "{DASHBOARD_PORT}:{HOSTNAME}:{DASHBOARD_PORT} {HOSTNAME}".format(
            **connection_info
        )
    )

    log.info(
        (
            "In a new terimal the machine that you run the pipeline on, copy and paste the following string to forward "
            "ports to this server:"
        )
    )
    log.info(connection_str)
    log.info(" ")
    log.info("Then use the following command to kick off your FPP jobs:")
    log.info("fpp_process --distributed 1 --port {PORT}".format(**connection_info))
    log.info(" ")
    log.info("You can see the dashboard on:")
    log.info("localhost:{PORT}".format(**connection_info))
    log.info(" ")
    log.info("Command + C will teardown the server.")

    try:
        time.sleep(args.up_time * 60 * 60)
    except KeyboardInterrupt:
        log.info("Tearing down scheduler.")
        cluster.close()
Example #14
                    self._depencendies = " ".join(f"'{dep}'"
                                                  for dep in dependencies)

                def setup(self, worker: Worker):
                    os.system(f"pip install {self._depencendies}")

            dependency_installer = DependencyInstaller([
                "git+https://github.com/andrzejnovak/boostedhiggs.git#egg=boostedhiggs",
                "scipy==1.8.1",
            ])

            client = Client("tls://localhost:8786")
            client.register_worker_plugin(dependency_installer)

        else:
            cluster.adapt(minimum=args.scaleout)
            client = Client(cluster)
            print("Waiting for at least one worker...")
            client.wait_for_workers(1)
        with performance_report(filename="dask-report.html"):
            output = processor.run_uproot_job(
                sample_dict,
                treename='Events',
                processor_instance=processor_object,
                executor=processor.dask_executor,
                executor_args={
                    'client': client,
                    # 'skipbadfiles': args.skipbadfiles,
                    'schema': processor.NanoAODSchema,
                    'retries': 3,
                },
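The top of this example is cut off; the class being defined appears to be a Dask worker plugin that pip-installs extra dependencies on every worker. A minimal sketch of the complete pattern, assuming the standard dask.distributed WorkerPlugin API:

import os
from dask.distributed import Client, Worker, WorkerPlugin

class DependencyInstaller(WorkerPlugin):
    def __init__(self, dependencies):
        # Quote each requirement so the shell call below stays intact.
        self._dependencies = " ".join(f"'{dep}'" for dep in dependencies)

    def setup(self, worker: Worker):
        os.system(f"pip install {self._dependencies}")

client = Client("tls://localhost:8786")
client.register_worker_plugin(DependencyInstaller(["scipy==1.8.1"]))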
Example #15
def record_dataset(args):
    if args.resume and not args.overwrite:
        resume_args = yaml.load((Path(args.resume) / 'config.yaml').read_text())
        vars(args).update({k: v for k, v in vars(resume_args).items() if 'resume' not in k})

    args.ds_dir = Path(args.ds_dir)
    if args.ds_dir.is_dir():
        if args.resume:
            assert (args.ds_dir / 'seeds_recorded.txt').exists()
        elif args.overwrite:
            shutil.rmtree(args.ds_dir)
        else:
            raise ValueError('There is already a dataset with this name')
    args.ds_dir.mkdir(exist_ok=True)

    (args.ds_dir / 'config.yaml').write_text(yaml.dump(args))

    log_dir = DASK_LOGS_DIR.as_posix()
    if args.distributed:
        env_extra = [
            'module purge',
            f'source {CONDA_BASE_DIR}/bin/activate',
            f'conda activate {CONDA_ENV}',
            f'cd {PROJECT_DIR}',
            f'eval $(python -m job_runner.assign_gpu)',
            'export OMP_NUM_THREADS=1',
            'export MKL_NUM_THREADS=1',
        ]
        n_processes = args.n_processes_per_gpu
        log_path = (DASK_LOGS_DIR / 'all_logs.out').as_posix()

        cluster = SLURMCluster(cores=n_processes,
                               memory='160 GB',
                               queue=f'{SLURM_GPU_QUEUE}',
                               walltime='10:00:00',
                               processes=n_processes,
                               local_directory=log_dir,
                               log_directory=log_dir,
                               nthreads=1,
                               memory_monitor_interval='1000000000000000s',
                               env_extra=env_extra,
                               job_extra=[
                                   f'--qos={SLURM_QOS}',
                                   '--hint=nomultithread',
                                   '--gres=gpu:1',
                                   f'--output={log_path}',
                                   f'--error={log_path}'
                               ],
                               interface=DASK_NETWORK_INTERFACE)
        cluster.adapt(minimum_jobs=args.n_workers, maximum_jobs=args.n_workers)
    else:
        cluster = LocalCluster(local_directory=log_dir, processes=True, n_workers=4)

    client = Client(cluster)

    all_keys = record_dataset_dask(client=client, ds_dir=args.ds_dir,
                                   scene_kwargs=args.scene_kwargs,
                                   scene_cls=args.scene_cls,
                                   start_seed=0,
                                   n_chunks=int(args.n_chunks),
                                   n_frames_per_chunk=int(args.n_frames_per_chunk),
                                   resume=args.resume)

    n_train = int(args.train_ratio * len(all_keys))
    train_keys, val_keys = all_keys[:n_train], all_keys[n_train:]
    Path(args.ds_dir / 'keys.pkl').write_bytes(pickle.dumps(all_keys))
    Path(args.ds_dir / 'train_keys.pkl').write_bytes(pickle.dumps(train_keys))
    Path(args.ds_dir / 'val_keys.pkl').write_bytes(pickle.dumps(val_keys))

    client.close()
    del cluster
Example #16
import distributed.joblib
from joblib import Parallel, parallel_backend

from skopt import BayesSearchCV

from molmimic.torch_model.torch_train import Molmimic

cluster = SLURMCluster(walltime="3-00:00:00",
                       memory="12000M",
                       cores=16,
                       project="muragroup",
                       queue="gpu",
                       gres="gpu:p100:1",
                       ntasks="1")
cluster.adapt()
client = Client(cluster)

space = {
    "learning_rate": Real(1e-6, 1e-1, prior='log-uniform'),
    "num_epochs": Integer(30, 500, prior='log-uniform'),
    "batch_size": Integer(1, 30, prior='log-uniform'),
    "dropout_depth": Integer(0, 1),  #boolean
    "dropout_width": Integer(0, 1),  #boolean
    "dropout_p": Real(0., 1., prior='log-uniform')
}

opt = BayesSearchCV(Molmimic(), space, n_iter=100)

with parallel_backend('dask.distributed', client=client):
    opt.fit("default")
Example #17
    def run(
        self,
        distributed: bool = False,
        clean: bool = False,
        debug: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.

        Parameters
        ----------
        distributed: bool
            Create a SLURMCluster to use for job distribution.
            Default: False (do not create a cluster)
        clean: bool
            Should the local staging directory be cleaned prior to this run.
            Default: False (Do not clean)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc.
            Default: False (Do not debug)

        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/

        Basic prefect example:
        https://docs.prefect.io/core/
        """
        # Initialize steps
        raw = steps.MappedRaw()
        invert = steps.MappedInvert()
        cumsum = steps.MappedSum()
        plot = steps.Plot()
        fancyplot = steps.Fancyplot()

        # Choose executor
        if distributed:
            # Log dir settings
            log_dir_name = datetime.now().isoformat().split(".")[0]  # Do not include ms
            log_dir = Path(f".logs/{log_dir_name}/")
            log_dir.mkdir(parents=True)

            # Spawn cluster
            cluster = SLURMCluster(
                cores=2,
                memory="32GB",
                walltime="10:00:00",
                queue="aics_cpu_general",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )

            # Set adaptive scaling
            cluster.adapt(minimum_jobs=1, maximum_jobs=40)

        else:
            # Stop conflicts between Dask and OpenBLAS
            # Info here:
            # https://stackoverflow.com/questions/45086246/too-many-memory-regions-error-with-dask
            os.environ["OMP_NUM_THREADS"] = "1"

            # Spawn local cluster
            cluster = LocalCluster()

        # Log bokeh info
        if cluster.dashboard_link:
            log.info(f"Dask UI running at: {cluster.dashboard_link}")

        # Start local dask cluster
        exe = DaskExecutor(cluster.scheduler_address)

        # Configure your flow
        with Flow("example_step_workflow") as flow:
            # If your step utilizes a secondary flow with dask pass the executor address
            # If you want to clean the local staging directories pass clean
            # If you want to utilize some debugging functionality pass debug
            # If you don't utilize any of these, just pass the parameters you need.
            matrices = raw(
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
                **kwargs,  # Allows us to pass `--n {some integer}` or other params
            )
            inversions = invert(
                matrices,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )
            vectors = cumsum(
                inversions,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )
            plot(
                vectors,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )
            fancyplot(
                vectors,
                distributed_executor_address=cluster.scheduler_address,
                clean=clean,
                debug=debug,
            )

        # Run flow and get ending state
        state = flow.run(executor=exe)

        # Get plot location
        log.info(f"Plot stored to: {plot.get_result(state, flow)}")

        # Close cluster
        if distributed:
            cluster.close()
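A hypothetical driver for this method; the workflow class name below is made up, since the class definition is not part of this excerpt:

if __name__ == "__main__":
    workflow = ExampleStepWorkflow()  # hypothetical class exposing run()
    workflow.run(distributed=True, clean=True, debug=False)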
Example #18
import logging, time
import xarray as xr
from dask.distributed import Client
from typing import List, Optional, Tuple, Dict, Any
from dask_jobqueue import SLURMCluster
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

variable = "tas"
uri = 'https://dataserver.nccs.nasa.gov/thredds/dodsC/bypass/CREATE-IP/reanalysis/MERRA2/mon/atmos/tas.ncml'

cluster = SLURMCluster(queue="myNodes")
cluster.adapt(minimum=1, maximum=4, interval="2s", wait_count=500)
print("CLUSTER JOB SCRIPT: " + cluster.job_script())
client = Client(cluster)

t0 = time.time()
dset: xr.Dataset = xr.open_dataset(uri)
da: xr.DataArray = dset['tas']
da2: xr.DataArray = da.groupby('time.month').mean('time')
da_monthly = da2.load()
print(da_monthly)
print(" Completed computation in " + str(time.time() - t0) + " seconds")
client.close()
cluster.close()
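Note that without a chunks= argument, xr.open_dataset does not return dask-backed arrays, so the groupby computation above does not actually run on the SLURM workers. A hedged variant that would use the cluster (the chunk size is illustrative):

dset = xr.open_dataset(uri, chunks={"time": 12})   # dask-backed array, one task per 12 time steps
da = dset["tas"]
da_monthly = da.groupby("time.month").mean("time").compute()  # executed on the cluster workers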
Example #19
## Problem on Occigen: the jobs terminate because of killed workers ...

from dask_jobqueue import SLURMCluster 
from dask.distributed import Client 
  
cluster = SLURMCluster(cores=28,
                       name='make_zarr',
                       walltime='00:20:00',
                       job_extra=['--constraint=BDW28', '--exclusive', '--nodes=1'],
                       memory='20GB')
print(cluster.job_script()) 
  
cluster.scale(1) 
cluster.adapt(minimum=1, maximum=4) 
  
from dask.distributed import Client 
client = Client(cluster) 
client 

import xarray as xr
import numpy as np
import glob
import time
                                   

data_dir = '/store/CT1/hmg2840/lbrodeau/eNATL60/eNATL60-BLBT02-S/'

tfiles = sorted(glob.glob(data_dir + '*/eNATL60-BLBT02_1h_*_gridT_20090701-20090701.nc'))
sfiles = sorted(glob.glob(data_dir + '*/eNATL60-BLBT02_1h_*_gridS_20090701-20090701.nc'))

def non_time_coords(ds):
    return [v for v in ds.data_vars
            if 'time_counter' not in ds[v].dims]

def drop_non_essential_vars_pop(ds):