def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # mark the call as pure so dask can deduplicate identical submissions
        # (use pure=False if the function's side effects must always re-run)
        pure=True,
    )
    client.gather(futures)
    print('Shutting down dask workers')
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # mark the call as pure so dask can deduplicate identical submissions
        # (use pure=False if the function's side effects must always re-run)
        pure=True,
    )
    run_id = client.gather(futures)
    print(f'Train run id: {run_id}')
Example #3
def run_HPC():
        
    #################
    # Setup dask cluster
    #################
    
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]
    
    #job args
    extra_args=[
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]
    
    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3, 
        memory='11GB', 
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/", death_timeout=150)
    
    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)
    
    dask_client = Client(cluster)
    
    #Start dask
    dask_client.run_on_scheduler(start_tunnel)  
    run(config, debug=False)
def start_dask_cluster(number_of_workers, mem_size="10GB"):

    #################
    # Setup dask cluster
    #################

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory=mem_size,
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/dask/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
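
# Note: `start_tunnel`, passed to `run_on_scheduler` in these snippets, is not
# defined here. A minimal sketch, assuming it only reports the scheduler host so
# an SSH tunnel to the dask dashboard can be opened (host and port details below
# are illustrative, not taken from the original code):
def start_tunnel():
    import socket
    host = socket.gethostname()
    print("Dask scheduler running on {}".format(host))
    print("Tunnel example: ssh -N -L 8787:{}:8787 <user>@<login-node>".format(host))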
Example #5
def initialize_dask(n, factor = 5, slurm = False):

    if not slurm:
        cores =  len(os.sched_getaffinity(0))
        cluster = distributed.LocalCluster(processes = False,
                                           n_workers = 1,
                                           threads_per_worker = 1)

    else:
        n = min(100, n)
        py = './enter_conda.sh python3'
        params = {
            'python' : py,
            'cores' : 1,
            'memory' : '512MB',
            'walltime' : '180',
            'processes' : 1,
            'job_extra' : [
                '--qos use-everything',
                '--array 0-{0:d}'.format(n - 1),
                '--requeue',
                '--output "/dev/null"'
            ],
            'env_extra' : [
                'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}',
                'source /etc/profile.d/modules.sh',
                'cd {0!s}'.format(CONFIG['PATHS', 'root']),
            ]
        }
        cluster = SLURMCluster(**params)
        print(cluster.job_script())
        cluster.scale(1)

    print(cluster.dashboard_link)
    return distributed.Client(cluster)
def start(cpus=0, gpus=0, mem_size="10GB"):
    #################
    # Setup dask cluster
    #################
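    # Note: this function assumes exactly one of `cpus` or `gpus` is non-zero;
    # if both are given, only the GPU cluster created last is attached to the
    # returned client, and if neither is set `cluster` is never defined.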

    if cpus > 0:
        #job args
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out"
        ]

        cluster = SLURMCluster(
            processes=1,
            queue='hpg2-compute',
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources cpu=1'],
            scheduler_options={"dashboard_address": ":8781"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        print(cluster.job_script())
        cluster.scale(cpus)

    if gpus:
        #job args
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out",
            "--partition=gpu", "--gpus=1"
        ]

        cluster = SLURMCluster(
            processes=1,
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources gpu=1'],
            scheduler_options={"dashboard_address": ":8787"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        cluster.scale(gpus)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
def evaluate_pdnet_sense_dask(run_id, contrast, af, n_iter,
                              cuda_visible_devices, n_samples):
    job_name = f'evaluate_pdnet_sense_{af}'
    if contrast is not None:
        job_name += f'_{contrast}'

    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='160GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        evaluate_pdnet_sense,
        # *args
        run_id,
        contrast,
        int(af),
        n_iter,
        n_samples,
        cuda_visible_devices,
        # mark the call as pure so dask can deduplicate identical submissions
        # (use pure=False if the function's side effects must always re-run)
        pure=True,
    )
    metrics_names, eval_res = client.gather(futures)
    print(metrics_names)
    print(eval_res)
    print('Shutting down dask workers')
Example #8
def run_HPC(data_paths):

    #################
    # Setup dask cluster
    #################

    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client, wait

    DeepForest_config = config.load_config()
    num_workers = DeepForest_config["num_hipergator_workers"]

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory='13GB',
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    for site in data_paths:
        futures = dask_client.map(Generate.run,
                                  data_paths[site],
                                  site=site,
                                  DeepForest_config=DeepForest_config)
        wait(futures)
        print("{} complete".format(site))

    print("All sites complete")
def launch_dask_tasks(batch_sizes, save):
    job_name = 'dask_mnist_tf_example'

    cluster = SLURMCluster(
        cores=1,
        job_cpu=10,
        memory='10GB',
        job_name=job_name,
        walltime='1:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-dev',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
    )
    n_jobs = len(batch_sizes)
    cluster.scale(jobs=n_jobs)
    print(cluster.job_script())

    client = Client(cluster)
    futures = [
        client.submit(
            # function to execute
            train_dense_model,
            # *args
            None,
            save,
            batch_size,
            # saving to disk is a side effect, so the call is only marked pure
            # when nothing is saved
            pure=not save,
        ) for batch_size in batch_sizes
    ]
    job_result = client.gather(futures)
    if all(job_result):
        print('All jobs finished without errors')
    else:
        print('At least one job errored out')
    print('Shutting down dask workers')
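
# Example invocation (hypothetical batch sizes): one SLURM job is submitted per
# batch size, so the call below would launch three jobs in parallel.
# launch_dask_tasks([32, 64, 128], save=False)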
def eval_on_jz_dask(job_name, eval_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            # for now we can't use 4 GPUs because of
            # https://github.com/tensorflow/tensorflow/issues/39268
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        eval_function,
        *args,
        **kwargs,
        # mark the call as pure so dask can deduplicate identical submissions
        # (use pure=False if the function's side effects must always re-run)
        pure=True,
    )
    metrics_names, eval_res = client.gather(futures)
    print(metrics_names)
    print(eval_res)
    print('Shutting down dask workers')
Example #11
def get_slurm_dask_client(n_workers, n_cores, n_processes):

    cluster = SLURMCluster(cores=n_cores,
                           processes=n_processes,
                           memory='80GB',
                           interface='ib0',
                           queue='standard',
                           job_extra=['-e slurm-%j.err', '-o slurm-%j.out',
                                      '--time=72:00:00', '--requeue'])

    header_lines = cluster.job_header.split('\n')
    mem_pos = find_mem_pos(header_lines)
    header_lines = header_lines[:mem_pos]+header_lines[mem_pos+1:]
    cluster.job_header = '\n'.join(header_lines)
    print(cluster.job_script())
    # Scale cluster to n_workers
    cluster.scale(n_workers)
    # Wait for cluster to start
    time.sleep(30)
    client = Client(cluster)
    print(client.scheduler_info())

    return client
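
# Note: `find_mem_pos`, used above to strip the `--mem` directive from the
# generated job header, is not defined in this snippet. A minimal sketch of what
# it presumably does (an assumption, not the original implementation):
def find_mem_pos(header_lines):
    # return the index of the '#SBATCH --mem=...' line so it can be removed
    for pos, line in enumerate(header_lines):
        if line.startswith('#SBATCH') and '--mem' in line:
            return pos
    raise ValueError('no --mem directive found in the job header')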
import logging, time
import xarray as xr
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

variable = "tas"
uri = 'https://dataserver.nccs.nasa.gov/thredds/dodsC/bypass/CREATE-IP/reanalysis/MERRA2/mon/atmos/tas.ncml'

cluster = SLURMCluster(queue="myNodes")
cluster.adapt(minimum=1, maximum=4, interval="2s", wait_count=500)
print("CLUSTER JOB SCRIPT: " + cluster.job_script())
client = Client(cluster)

t0 = time.time()
dset: xr.Dataset = xr.open_dataset(uri)
da: xr.DataArray = dset['tas']
da2: xr.DataArray = da.groupby('time.month').mean('time')
da_monthly = da2.load()
print(da_monthly)
print(" Completed computation in " + str(time.time() - t0) + " seconds")
client.close()
cluster.close()
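
# Note: opened without `chunks=`, the dataset above is not dask-backed, so the
# groupby/mean runs in the local process rather than on the SLURM workers. A
# minimal sketch of a chunked open that would push the computation to the
# cluster (the chunk size is illustrative, not from the original script):
#
#     dset = xr.open_dataset(uri, chunks={'time': 12})
#     da_monthly = dset['tas'].groupby('time.month').mean('time').compute()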
Example #13
def main():
    parser = argparse.ArgumentParser(
        description = 'Simple example for using dask-jobqueue with SLURM')

    parser.add_argument('--proc_per_job', type = int, default = 1,
                        help = 'Number of processes per job.')
    parser.add_argument('--cores_per_proc', type = float, default = 2,
                        help = 'Number of cores per process.')
    parser.add_argument('--n_jobs', type = int, default = 1,
                        help = 'Number of jobs')
    parser.add_argument('--array', type = int, default = 0,
                        help = 'EXPERIMENTAL. If >0, then submit a job array '+\
                        'of this size. The total number of jobs will'+\
                        ' be `array * n_jobs`.')
    parser.add_argument('--container', type = str,
                        help = 'Path to singularity container. If `None`, '+\
                        'then assumes conda environment.')
    parser.add_argument('--qos', type = str, help = 'QOS to use.')
    parser.add_argument('--dry', action = 'store_true',
                        help = 'Print job script and exit (no submission)')
    parser.add_argument('--load', type = int, default = 1000,
                        help = 'Load for the function.')
    args = parser.parse_args()

    n_procs = args.proc_per_job * args.n_jobs

    params = {
        'cores' : int(args.cores_per_proc * args.proc_per_job),
        'memory' : '{0:d}00MB'.format(args.proc_per_job*5),
        'processes' : args.proc_per_job,
        # The name to assign to each worker
        'name' : 'dask_test'
    }

    job_extra = ['--requeue']
    env_extra = []

    if args.qos is not None:
        job_extra.append('--qos {}'.format(args.qos))

    if args.array > 0:
        n_procs = n_procs * args.array
        job_extra.append('--array 0-{0:d}'.format(args.array - 1))
        """
        This is added to ensure that each worker has a unique ID.
        This may be unnecessary.
        """
        env_extra.append(
            'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}')

    if args.container is not None:
        """
        When using a  container, dask needs to know how to enter the python
        environment.

        Note:
        The binding `-B..` is cluster (OpenMind) specific but can be generalized.
        The binding is required since `singularity` will not bind by default.
        """
        cont = os.path.normpath(args.container)
        bind = cont.split(os.sep)[1]
        bind = '-B /{0!s}:/{0!s}'.format(bind)
        py = 'singularity exec {0!s} {1!s} python3'.format(bind, cont)
        params.update({'python' : py})
        """
        Dask will generate a job script but some elements will be missing
        due to the way the singularity container interfaces with slurm.
        The `modules` need to be initialized and `singularity` needs to be added.
        """
        env_extra += [ 'source /etc/profile.d/modules.sh',
        'module add openmind/singularity/2.6.0']

    params.update({ 'job_extra' : job_extra,
                    'env_extra' : env_extra})

    cluster = SLURMCluster(**params)
    """
    Display the job script.
    """
    print(cluster.job_script())
    pprint(params)

    t0 = time.time()
    num_crunch(100)
    expected_dur = (time.time() - t0) * args.load
    print('Expected time of linear call: {0:f}'.format(expected_dur))

    if args.dry:
        return

    """
    Scale the cluster to the number of jobs.
    """
    print('Scaling by {}'.format(args.n_jobs))
    cluster.scale_up(args.proc_per_job * args.n_jobs)

    """
    Setup a client that interfaces with the workers
    """
    client = distributed.Client(cluster)
    time.sleep(10)
    print(cluster)
    print(client)
    pprint(client.has_what())
    # pprint(client.scheduler_info())
    """
    Generate a transaction.
    """
    futures = client.map(num_crunch, range(args.load))
    t0 = time.time()

    """
    Compute (and then discard) while keeping track of progress.
    """
    distributed.progress(futures)
    dur = time.time() - t0
    msg = '\n\nSpeed up of {0:f}x ({1:f}/{2:f})'.format((expected_dur / dur),
                                                    expected_dur, dur)
    print(msg)
    msg = 'Ideal speed up is {0:f}x'.format(n_procs)
    print(msg)
    """
import sys
import time

sys.path.insert(0, '/home/albert7a/git/xscale')
import xscale

from dask_jobqueue import SLURMCluster
from dask.distributed import Client

cluster = SLURMCluster(cores=1,
                       name='make_profiles',
                       walltime='00:30:00',
                       job_extra=[
                           '--constraint=HSW24', '--exclusive', '--nodes=1',
                           '--ntasks-per-node=24'
                       ],
                       memory='120GB',
                       interface='ib0')
print(cluster.job_script())

cluster.scale(240)

client = Client(cluster)

nb_workers = 0
while True:
    nb_workers = len(client.scheduler_info()["workers"])
    if nb_workers >= 2:
        break
    time.sleep(1)

data_dir = '/store/CT1/hmg2840/lbrodeau/eNATL60/eNATL60-BLBT02-S/'
gridfile = '/store/CT1/hmg2840/lbrodeau/eNATL60/eNATL60-I/mesh_mask_eNATL60_3.6_lev1.nc4'
Example #15
def main():

    # If multiprocessing, set cluster and job settings
    if cfg.MULTI_PHRASE_CNCPTS:

        cluster = SLURMCluster(
            queue='main',
            cores=8,
            memory="36000 MB",
            processes=7,
            job_extra=[
                '--job-name=extract_concepts', '--mail-type=NONE',
                '-e /home/students/suter/logs_suter/worker_logs/slurm-%j.err',
                '-o /home/students/suter/logs_suter/worker_logs/slurm-%j.out',
                '--time=24:00:00'
            ],
            env_extra=['PATH=/opt/slurm/bin:$PATH'])

    # Load files to process
    with open(cfg.FILES_TO_USE) as infile:
        files_to_use = infile.read().split('\n')

    # Load captions
    with open(cfg.DATA_DIR + "flickr30k-captions/results_20130124.token",
              "r") as infile:
        data = infile.read()
        caption_data = data.split('\n')

    # Load files that are already processed
    processed_files_dir = [
        f for f in os.listdir('data/phrase_concepts/') if f.endswith('.pickle')
    ]
    processed_samples = []

    # Save files that are already processed
    for file in processed_files_dir:
        parts = file.split('_')
        image_ID = parts[-2]
        caption_ID = parts[-1].rsplit('.', 1)[0]  # drop the '.pickle' extension
        processed_samples.append((image_ID, caption_ID))

    print(len(processed_samples))

    # Load dataset and get captions and entities
    dataset = sg.Dataset(cfg.DATA_DIR)
    dataset.get_captions()
    dataset.get_entities()

    # Cluster settings
    if cfg.MULTI_PHRASE_CNCPTS:
        cluster.scale(jobs=15)
        print(cluster.job_script())
        client = Client(cluster)

    # Initialize
    futures = []
    chunk_counter = 0

    # For each sample
    for line in caption_data:

        # Skip if empty
        if line.strip() == '':
            continue

        # Get ID and sent
        split_line = line.split('\t')
        ID = split_line[0]
        sent = split_line[1]

        # Get image and caption ID
        split_ID = ID.split('#')
        image_ID = split_ID[0].rsplit('.', 1)[0]  # drop the '.jpg' extension
        caption_ID = split_ID[1]

        print(image_ID, caption_ID)

        # Skip if already processed
        if (image_ID, caption_ID) in processed_samples:
            continue

        # Skip if not in "files to process"
        if image_ID not in files_to_use:
            continue

        # Multiprocessing
        if cfg.MULTI_PHRASE_CNCPTS:

            # Count multi-processing chunks
            chunk_counter += 1

            # Run: get phrase concepts for given sample
            future = client.submit(get_phrase_concepts, line, dataset,
                                   cfg.BBOXES_DIR)
            futures.append(future)

            # Gather after 20 samples
            if chunk_counter >= 20:
                client.gather(futures)
                futures = []
                chunk_counter = 0

        # Without multi-processing
        else:
            get_phrase_concepts(line, dataset, cfg.BBOXES_DIR)

    # Final gathering of results
    if cfg.MULTI_PHRASE_CNCPTS:
        client.gather(futures)

    # Load all phrase concept files
    phrase_concepts_files = [
        f for f in os.listdir('data/phrase_concepts/') if f.endswith('.txt')
    ]

    # Write it all out to one file
    with open('data/phrase_concepts.txt', 'w') as outfile:
        outfile.write('')

        # For each file, load concent and copy to new file
        for file in phrase_concepts_files:

            # Load content
            with open('data/phrase_concepts/' + file, 'r') as infile:
                line = infile.read()
                line = line.split('\n')[0]

                # Write out to new file
                with open('data/phrase_concepts.txt', 'a') as outfile:
                    outfile.write(line)
                    outfile.write('\n')