def test_basic(loop):
    with HTCondorCluster(cores=1, memory="100MB", disk="100MB", loop=loop) as cluster:
        with Client(cluster) as client:
            cluster.scale(2)

            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            assert cluster.running_jobs

            workers = list(client.scheduler_info()["workers"].values())
            w = workers[0]
            assert w["memory_limit"] == 1e8
            assert w["nthreads"] == 1

            cluster.scale(0)

            start = time()
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
def test_header():
    with HTCondorCluster(cores=1, memory="100MB", disk="100MB") as cluster:
        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerCores"] == 1
        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000
        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000
def test_job_script():
    with HTCondorCluster(
        cores=4,
        processes=2,
        memory="100MB",
        disk="100MB",
        env_extra=['export LANG="en_US.utf8"', 'export LC_ALL="en_US.utf8"'],
        job_extra={"+Extra": "True"},
    ) as cluster:
        job_script = cluster.job_script()
        assert "RequestCpus = MY.DaskWorkerCores" in job_script
        assert "RequestDisk = floor(MY.DaskWorkerDisk / 1024)" in job_script
        assert "RequestMemory = floor(MY.DaskWorkerMemory / 1048576)" in job_script
        assert "MY.DaskWorkerCores = 4" in job_script
        assert "MY.DaskWorkerDisk = 100000000" in job_script
        assert "MY.DaskWorkerMemory = 100000000" in job_script
        assert 'MY.JobId = "$(ClusterId).$(ProcId)"' in job_script
        assert "LANG=en_US.utf8" in job_script
        assert "LC_ALL=en_US.utf8" in job_script
        assert "export" not in job_script
        assert "+Extra = True" in job_script
        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--memory-limit 50.00MB" in job_script
        assert "--nthreads 2" in job_script
        assert "--nprocs 2" in job_script
def test_job_script():
    with HTCondorCluster(cores=4,
                         processes=2,
                         memory='100MB',
                         disk='100MB',
                         env_extra=[
                             'export LANG="en_US.utf8"',
                             'export LC_ALL="en_US.utf8"'
                         ],
                         job_extra={'+Extra': "True"}) as cluster:
        job_script = cluster.job_script()
        assert 'RequestCpus = MY.DaskWorkerCores' in job_script
        assert 'RequestDisk = floor(MY.DaskWorkerDisk / 1024)' in job_script
        assert 'RequestMemory = floor(MY.DaskWorkerMemory / 1048576)' in job_script
        assert 'MY.DaskWorkerCores = 4' in job_script
        assert 'MY.DaskWorkerDisk = 100000000' in job_script
        assert 'MY.DaskWorkerMemory = 100000000' in job_script
        assert 'MY.JobId = "$(ClusterId).$(ProcId)"' in job_script
        assert 'LANG=en_US.utf8' in job_script
        assert 'LC_ALL=en_US.utf8' in job_script
        assert 'JOB_ID=$F(MY.JobId)' in job_script
        assert 'export' not in job_script
        assert '+Extra = True' in job_script
        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--memory-limit 50.00MB' in job_script
        assert '--nthreads 2' in job_script
        assert '--nprocs 2' in job_script
def test_extra_args_broken_cancel(loop):
    with HTCondorCluster(
        cores=1,
        memory="100MB",
        disk="100MB",
        loop=loop,
        cancel_command_extra=["-name", "wrong.docker"],
    ) as cluster:
        with Client(cluster) as client:
            cluster.scale(2)
            client.wait_for_workers(2)
            workers = Job._call(["condor_q", "-af", "jobpid"]).strip()
            assert workers, "we got dask workers"

            cluster.scale(0)
            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                workers = Job._call(["condor_q", "-af", "jobpid"]).strip()
                assert workers, "killing workers with broken cancel_command didn't fail"
                if time() > start + QUEUE_WAIT // 3:
                    return
def test_config_name_htcondor_takes_custom_config():
    conf = {
        "cores": 1,
        "memory": "120 MB",
        "disk": "120 MB",
        "job-extra": [],
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env condor_submit",
        "local-directory": "/tmp",
    }

    with dask.config.set({"jobqueue.htcondor-config-name": conf}):
        with HTCondorCluster(config_name="htcondor-config-name") as cluster:
            assert cluster.job_name == "myname"
def test_config_name_htcondor_takes_custom_config():
    conf = {
        'cores': 1,
        'memory': '120 MB',
        'disk': '120 MB',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': "#!/usr/bin/env condor_submit",
        'local-directory': "/tmp",
    }

    with dask.config.set({'jobqueue.htcondor-config-name': conf}):
        with HTCondorCluster(config_name='htcondor-config-name') as cluster:
            assert cluster.name == 'myname'
import socket

from dask.distributed import Client
from dask_jobqueue import HTCondorCluster


def main():
    n_port = 8786
    with HTCondorCluster(
            cores=1,
            memory='100MB',
            disk='100MB',
            death_timeout='60',
            nanny=True,
            scheduler_options={
                'port': n_port,
                'host': socket.gethostname()
            },
            job_extra={
                'should_transfer_files': 'Yes',
                'when_to_transfer_output': 'ON_EXIT'
            },
            extra=['--worker-port {}'.format(n_port)]) as cluster:
        with Client(cluster) as client:
            cluster.scale(1)
            future = client.submit(lambda x: x + 1, 10)
            print('Result is {}'.format(future.result()))
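# Typical entry point if the example above is run as a standalone script.
if __name__ == "__main__":
    main()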
def main(args):
    base_dir = args.base_dir
    n_workers = args.n_workers

    input_files = [base_dir + "/" + fl for fl in os.listdir(base_dir)]
    print("Found {} files in {}".format(len(input_files), base_dir))

    start = time()
    n_port = 8786
    with HTCondorCluster(
            cores=1,
            memory="500MB",
            disk="500MB",
            death_timeout="60",
            nanny=False,
            scheduler_options={
                "port": n_port,
                "host": socket.gethostname()
            },
            job_extra={
                "should_transfer_files": "Yes",
                "when_to_transfer_output": "ON_EXIT",
                "+JobFlavour": "espresso"
            },
            extra=["--worker-port {}".format(n_port)]) as cluster:
        # print(cluster.job_script())
        with Client(cluster) as client:
            cluster.scale(n_workers)
            futures = client.map(dummy_extractor, input_files)
            arrays = client.gather(futures)

            end = time()
            final_arr = ak.concatenate(arrays)
            print("Done concatenating")
            print(final_arr.type)
            print("Computation time: {}".format(
                str(timedelta(seconds=int(end - start)))))
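# A possible command-line front end for main(args) above (a sketch, not part of
# the original script): the argument names mirror the args.base_dir and
# args.n_workers attributes used in main(); the flag spellings and default
# values are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract awkward arrays from input files with Dask on HTCondor")
    parser.add_argument("--base-dir", dest="base_dir", required=True,
                        help="directory containing the input files")
    parser.add_argument("--n-workers", dest="n_workers", type=int, default=4,
                        help="number of HTCondor worker jobs to request")
    main(parser.parse_args())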
def test_job_script():
    with HTCondorCluster(
        cores=4,
        processes=2,
        memory="100MB",
        disk="100MB",
        env_extra=['export LANG="en_US.utf8"', 'export LC_ALL="en_US.utf8"'],
        job_extra={"+Extra": "True"},
        submit_command_extra=["-verbose"],
        cancel_command_extra=["-forcex"],
    ) as cluster:
        job_script = cluster.job_script()
        assert "RequestCpus = MY.DaskWorkerCores" in job_script
        assert "RequestDisk = floor(MY.DaskWorkerDisk / 1024)" in job_script
        assert "RequestMemory = floor(MY.DaskWorkerMemory / 1048576)" in job_script
        assert "MY.DaskWorkerCores = 4" in job_script
        assert "MY.DaskWorkerDisk = 100000000" in job_script
        assert "MY.DaskWorkerMemory = 100000000" in job_script
        assert 'MY.JobId = "$(ClusterId).$(ProcId)"' in job_script
        assert "LANG=en_US.utf8" in job_script
        assert "LC_ALL=en_US.utf8" in job_script
        assert "export" not in job_script
        assert "+Extra = True" in job_script
        assert re.search(
            r"condor_submit\s.*-verbose", cluster._dummy_job.submit_command
        )
        assert re.search(r"condor_rm\s.*-forcex", cluster._dummy_job.cancel_command)

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        formatted_bytes = format_bytes(parse_bytes("50MB")).replace(" ", "")
        assert f"--memory-limit {formatted_bytes}" in job_script
        assert "--nthreads 2" in job_script
        assert "--nprocs 2" in job_script
    require_encryption=True)

HTCondorJob.submit_command = "condor_submit -spool"

cluster = HTCondorCluster(
    cores=4,
    memory="2GB",
    disk="1GB",
    log_directory="logs",
    silence_logs="debug",
    scheduler_options={
        "dashboard_address": "8786",
        "port": 8787,
        "external_address": "129.93.183.33:8787"
    },
    # HTCondor submit script
    job_extra={
        "universe": "docker",
        # To be used with coffea-casa:0.1.11
        "encrypt_input_files": "/etc/cmsaf-secrets/xcache_token",
        # "docker_network_type": "host",
        "docker_image": "coffeateam/coffea-casa-analysis:0.1.11",
        "container_service_names": "dask",
        "dask_container_port": "8787",
        "should_transfer_files": "YES",
        "when_to_transfer_output": "ON_EXIT",
        "+DaskSchedulerAddress": '"129.93.183.33:8787"',
    })

# auto-scale between 5 and 100 jobs (maximum_memory="4 GB")
cluster.adapt(minimum_jobs=5, maximum_jobs=100, maximum_memory="4 GB")
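# Typical follow-up (a sketch, assuming `from dask.distributed import Client`
# is available): connect a client so that work submitted through it drives the
# adaptive scaling configured above.
client = Client(cluster)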
from tdub import setup_logging
from tdub.train import prepare_from_root
from tdub.utils import get_selection, get_features, quick_files

import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from dask_jobqueue import HTCondorCluster
from dask.distributed import Client
from dask_ml.model_selection import GridSearchCV

cluster = HTCondorCluster(cores=2, disk="4GB", memory="8GB")
client = Client(cluster)
cluster.adapt(maximum_jobs=200)

setup_logging()

qf = quick_files("/atlasgpfs01/usatlas/data/ddavis/wtloop/v29_20191111")
df, y, w = prepare_from_root(qf["tW_DR"], qf["ttbar"], "1j1b")
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    df, y, w, train_size=0.8, random_state=414, shuffle=True
)

n_sig = y_train[y_train == 1].shape[0]
n_bkg = y_train[y_train == 0].shape[0]
spw = n_bkg / n_sig

n_sig = y[y == 1].shape[0]
n_bkg = y[y == 0].shape[0]
spw = n_bkg / n_sig
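# A hedged continuation showing how the imported pieces would typically fit
# together on the HTCondor-backed Dask cluster: the parameter grid, estimator
# settings, and cv value below are illustrative assumptions, not part of the
# original snippet.
param_grid = {
    "num_leaves": [31, 63],
    "learning_rate": [0.05, 0.1],
    "n_estimators": [200, 500],
}
search = GridSearchCV(
    lgbm.LGBMClassifier(boosting_type="gbdt", scale_pos_weight=spw),
    param_grid,
    cv=3,
)
# Fits are scheduled on the Dask workers via the client created above.
search.fit(X_train, y_train)
print(search.best_params_)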
def test_header():
    with HTCondorCluster(cores=1, memory='100MB', disk='100MB') as cluster:
        assert cluster.job_header_dict['MY.DaskWorkerCores'] == 1
        assert cluster.job_header_dict['MY.DaskWorkerDisk'] == 100000000
        assert cluster.job_header_dict['MY.DaskWorkerMemory'] == 100000000
    if not check_port(8786):
        raise RuntimeError(
            "Port '8786' is occupied on this node. Try another one."
        )

    import socket

    cluster = HTCondorCluster(
        cores=1,
        memory='2GB',  # hardcoded
        disk='1GB',
        death_timeout='60',
        nanny=False,
        scheduler_options={
            'port': n_port,
            'host': socket.gethostname()
        },
        job_extra={
            'log': 'dask_job_output.log',
            'output': 'dask_job_output.out',
            'error': 'dask_job_output.err',
            'should_transfer_files': 'Yes',
            'when_to_transfer_output': 'ON_EXIT',
            '+JobFlavour': '"workday"',
        },
        extra=['--worker-port {}'.format(n_port)],
        env_extra=env_extra,
    )
elif 'slurm' in args.executor:
    cluster = SLURMCluster(
        queue='all',
        cores=args.workers,
        processes=args.workers,
from dask.distributed import performance_report

if 'slurm' in args.executor:
    cluster = SLURMCluster(
        queue='all',
        cores=args.workers,
        processes=args.workers,
        memory="200 GB",
        retries=10,
        walltime='00:30:00',
        env_extra=env_extra,
    )
elif 'condor' in args.executor:
    cluster = HTCondorCluster(
        cores=args.workers,
        memory='4GB',
        disk='4GB',
        env_extra=env_extra,
    )

if args.executor == 'dask/casa':
    client = Client("tls://localhost:8786")
    # client = Client("tls://andrzej-2enovak-40cern-2ech.dask.coffea.casa:8786")
    # import shutil
    # shutil.make_archive("workflows", "zip", base_dir="workflows")
    # client.upload_file("workflows.zip")
    from distributed.diagnostics.plugin import UploadDirectory
    # client.register_worker_plugin(UploadDirectory("workflows", restart=True, update_path=True), nanny=True)
    client.register_worker_plugin(
        UploadDirectory("workflows", restart=False, update_path=True), nanny=True)