def open(self):
    """Initiate and scale the cluster.

    Builds the Dask cluster object selected by ``self.cluster_type``, stores
    it in ``self.cluster`` and then asks it for ``self.num_worker`` workers.

    For job-queue clusters ('lsf', 'pbs', 'slurm') the cluster is built from
    ``self.cluster_kwargs`` and its job script is printed for reference.

    Raises:
        ValueError: if ``self.cluster_type`` is neither 'local' nor one of
            the job-queue cluster types handled here.
    """
    # Look at the ~/.config/dask/mintpy.yaml file for changing the Dask
    # configuration defaults
    print('initiate Dask cluster')

    if self.cluster_type == 'local':
        from dask.distributed import LocalCluster

        self.cluster = LocalCluster()

    else:
        # cluster type --> name of the matching class in dask_jobqueue
        job_cluster_names = {
            'lsf': 'LSFCluster',
            'pbs': 'PBSCluster',
            'slurm': 'SLURMCluster',
        }

        # Validate BEFORE importing dask_jobqueue, so that a typo in the
        # cluster type is reported as such even where dask_jobqueue is not
        # installed (instead of surfacing as an ImportError).
        if self.cluster_type not in job_cluster_names:
            msg = 'un-recognized input cluster: {}'.format(self.cluster_type)
            msg += '\nsupported clusters: {}'.format(CLUSTER_LIST)
            raise ValueError(msg)

        # for non-local cluster, import related dask module only when it's
        # needed, because job_queue is not available on macports
        import dask_jobqueue

        cluster_class = getattr(dask_jobqueue, job_cluster_names[self.cluster_type])
        self.cluster = cluster_class(**self.cluster_kwargs)

        # show dask cluster job script for reference
        job_script = self.cluster.job_script()
        print("\n", job_script)

        # for debug: flip to True to also dump the job script to a file
        debug_mode = False
        if debug_mode:
            # io.open is the builtin open(); spelled out explicitly because
            # this function is itself named `open`.
            import io

            with io.open('dask_command_run_from_python.txt', 'w') as f:
                f.write(job_script + '\n')

    # This line submits num_worker jobs to the cluster to start a bunch of workers
    # In tests on Pegasus `general` queue in Jan 2019, no more than 40 workers could RUN
    # at once (other user's jobs gained higher priority in the general at that point)
    print('scale the cluster to {} workers'.format(self.num_worker))
    self.cluster.scale(self.num_worker)
def open(self):
    """Initiate the cluster.

    Builds the Dask cluster object selected by ``self.cluster_type`` and
    stores it in ``self.cluster``. No workers are requested here.

    For job-queue clusters ('lsf', 'pbs', 'slurm') the cluster is built from
    ``self.cluster_kwargs`` and its job script is printed for reference.

    Raises:
        ValueError: if ``self.cluster_type`` is neither 'local' nor one of
            the job-queue cluster types handled here.
    """
    # Look at the ~/.config/dask/mintpy.yaml file for changing the Dask
    # configuration defaults
    print('initiate Dask cluster')

    if self.cluster_type == 'local':
        from dask.distributed import LocalCluster

        self.cluster = LocalCluster()

    else:
        # cluster type --> name of the matching class in dask_jobqueue
        job_cluster_names = {
            'lsf': 'LSFCluster',
            'pbs': 'PBSCluster',
            'slurm': 'SLURMCluster',
        }

        # Validate BEFORE importing dask_jobqueue, so that a typo in the
        # cluster type is reported as such even where dask_jobqueue is not
        # installed (instead of surfacing as an ImportError).
        if self.cluster_type not in job_cluster_names:
            msg = 'un-recognized input cluster: {}'.format(self.cluster_type)
            msg += '\nsupported clusters: {}'.format(CLUSTER_LIST)
            raise ValueError(msg)

        # for non-local cluster, import related dask module only when it's
        # needed, because job_queue is not available on macports
        import dask_jobqueue

        cluster_class = getattr(dask_jobqueue, job_cluster_names[self.cluster_type])
        self.cluster = cluster_class(**self.cluster_kwargs)

        # show dask cluster job script for reference
        job_script = self.cluster.job_script()
        print("\n", job_script)

        # for debug: flip to True to also dump the job script to a file
        debug_mode = False
        if debug_mode:
            # io.open is the builtin open(); spelled out explicitly because
            # this function is itself named `open`.
            import io

            with io.open('dask_command_run_from_python.txt', 'w') as f:
                f.write(job_script + '\n')
type=bool, default=False, help=( "set to True to use a SLURM cluster, False to run locally. If True, make" + " sure you run this script with sbatch, so the scheduler is on the same " + "network as the worker nodes." ), ) args = parser.parse_args() if args.use_slurm: cluster = dask_jobqueue.SLURMCluster( cores=4, processes=4, memory="2GB", walltime="0:00:05", queue="all", local_directory="/tmp/", interfacestr="em2", ) cluster.scale(16) # Ask the cluster for 16 worker processes, wait until arrival # Print a link to the HTTP diagnostics server print("Dashboard link: {0}".format(cluster.dashboard_link)) else: cluster = dask.distributed.LocalCluster(processes=False, dashboard_address=None) client = dask.distributed.Client(cluster) tuner = hyperband.Hyperband( get_hyperparameter_configuration, run_then_return_val_loss,
opts = ga_opts()

# perform safe checks prior to any calculation
safe_checks()

# clean the working directory
# NOTE(review): cleaning only runs when opts['continue'] is the string '1';
# confirm this polarity is the intended one.
if opts['continue'] == '1':
    clean()

# create SLURM Cluster if available; if not, use multiprocessing.
# SLURM_JOB_PARTITION being set is used as the signal that a SLURM cluster
# should be created.
slurm = os.getenv('SLURM_JOB_PARTITION')

if slurm is not None:
    cluster = dask_jobqueue.SLURMCluster(
        queue=slurm,
        cores=1,
        walltime='0',
        memory=opts['memory'],
        local_directory=os.getenv('TMPDIR', '/tmp'),
    )
    # let dask grow/shrink the number of jobs between the configured bounds
    cluster.adapt(minimum_jobs=opts['min'], maximum_jobs=opts['max'])
else:
    # local fallback: opts['max'] single-threaded worker processes
    cluster = LocalCluster(
        n_workers=opts['max'],
        processes=True,
        threads_per_worker=1,
        local_directory=os.getenv('TMPDIR', '/tmp'),
    )

# the client is created the same way for either kind of cluster
client = Client(cluster)

# read model configuration
parameters = configurate()
# perform safe checks prior to any calculation
safe_checks()

# clean the working directory
clean()

# create SLURM Cluster if available.
# A missing (or empty) SLURM_JOB_PARTITION means no cluster is created;
# .get() replaces a bare `except:` that would also have hidden unrelated
# errors.
slurm = os.environ.get('SLURM_JOB_PARTITION')

if slurm:
    cluster = dask_jobqueue.SLURMCluster(
        queue=slurm,
        cores=1,
        # raises KeyError if SLURM_MEM_PER_CPU is not set
        memory=os.environ['SLURM_MEM_PER_CPU'],
        local_directory=os.getenv('TMPDIR', '/tmp'),
    )
    client = Client(cluster)
    # NOTE(review): newer dask-jobqueue releases expose `cluster.scale()`
    # for this; confirm `start_workers` exists in the version in use.
    cluster.start_workers(opts['ntasks'])

# read model configuration
parameters = configurate()

# Sterope Main Algorithm
population = populate()

# simulate levels
simulate()

# evaluate sensitivity
sensitivity = evaluate()