Example #1
def test_job_script():
    with HTCondorCluster(
            cores=4,
            processes=2,
            memory="100MB",
            disk="100MB",
            env_extra=[
                'export LANG="en_US.utf8"', 'export LC_ALL="en_US.utf8"'
            ],
            job_extra={"+Extra": "True"},
    ) as cluster:
        job_script = cluster.job_script()
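        # The rendered submit description should request CPUs, disk, and memory
        # via the MY.DaskWorker* ClassAd attributes defined further down in it.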
        assert "RequestCpus = MY.DaskWorkerCores" in job_script
        assert "RequestDisk = floor(MY.DaskWorkerDisk / 1024)" in job_script
        assert "RequestMemory = floor(MY.DaskWorkerMemory / 1048576)" in job_script
        assert "MY.DaskWorkerCores = 4" in job_script
        assert "MY.DaskWorkerDisk = 100000000" in job_script
        assert "MY.DaskWorkerMemory = 100000000" in job_script
        assert 'MY.JobId = "$(ClusterId).$(ProcId)"' in job_script
        assert "LANG=en_US.utf8" in job_script
        assert "LC_ALL=en_US.utf8" in job_script
        assert "export" not in job_script
        assert "+Extra = True" in job_script

        assert ("{} -m distributed.cli.dask_worker tcp://".format(
            sys.executable) in job_script)
        assert "--memory-limit 50.00MB" in job_script
        assert "--nthreads 2" in job_script
        assert "--nprocs 2" in job_script
Example #2
def test_basic(loop):
    with HTCondorCluster(cores=1, memory="100MB", disk="100MB",
                         loop=loop) as cluster:
        with Client(cluster) as client:

            cluster.scale(2)

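            # Wait (up to QUEUE_WAIT seconds) for the scaled jobs to show up in
            # the HTCondor queue before submitting any work.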
            start = time()
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            workers = list(client.scheduler_info()["workers"].values())
            w = workers[0]
            assert w["memory_limit"] == 1e8
            assert w["nthreads"] == 1

            cluster.scale(0)

            start = time()
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
Example #3
def test_extra_args_broken_cancel(loop):
    with HTCondorCluster(
            cores=1,
            memory="100MB",
            disk="100MB",
            loop=loop,
            cancel_command_extra=["-name", "wrong.docker"],
    ) as cluster:
        with Client(cluster) as client:

            cluster.scale(2)

            client.wait_for_workers(2)
            workers = Job._call(["condor_q", "-af", "jobpid"]).strip()
            assert workers, "we got dask workers"

            cluster.scale(0)

            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)

                workers = Job._call(["condor_q", "-af", "jobpid"]).strip()
                assert workers, "killing workers with broken cancel_command didn't fail"

                if time() > start + QUEUE_WAIT // 3:
                    return
Example #4
def test_header():
    with HTCondorCluster(cores=1, memory="100MB", disk="100MB") as cluster:
        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerCores"] == 1
        assert cluster._dummy_job.job_header_dict[
            "MY.DaskWorkerDisk"] == 100000000
        assert cluster._dummy_job.job_header_dict[
            "MY.DaskWorkerMemory"] == 100000000
Example #5
def test_job_script():
    with HTCondorCluster(cores=4,
                         processes=2,
                         memory='100MB',
                         disk='100MB',
                         env_extra=[
                             'export LANG="en_US.utf8"',
                             'export LC_ALL="en_US.utf8"'
                         ],
                         job_extra={'+Extra': "True"}) as cluster:
        job_script = cluster.job_script()
        assert 'RequestCpus = MY.DaskWorkerCores' in job_script
        assert 'RequestDisk = floor(MY.DaskWorkerDisk / 1024)' in job_script
        assert 'RequestMemory = floor(MY.DaskWorkerMemory / 1048576)' in job_script
        assert 'MY.DaskWorkerCores = 4' in job_script
        assert 'MY.DaskWorkerDisk = 100000000' in job_script
        assert 'MY.DaskWorkerMemory = 100000000' in job_script
        assert 'MY.JobId = "$(ClusterId).$(ProcId)"' in job_script
        assert 'LANG=en_US.utf8' in job_script
        assert 'LC_ALL=en_US.utf8' in job_script
        assert 'JOB_ID=$F(MY.JobId)' in job_script
        assert 'export' not in job_script
        assert '+Extra = True' in job_script

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--memory-limit 50.00MB' in job_script
        assert '--nthreads 2' in job_script
        assert '--nprocs 2' in job_script
Example #6
def test_config_name_htcondor_takes_custom_config():
    conf = {
        "cores": 1,
        "memory": "120 MB",
        "disk": "120 MB",
        "job-extra": [],
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env condor_submit",
        "local-directory": "/tmp",
    }

    with dask.config.set({"jobqueue.htcondor-config-name": conf}):
        with HTCondorCluster(config_name="htcondor-config-name") as cluster:
            assert cluster.job_name == "myname"
Example #7
def test_config_name_htcondor_takes_custom_config():
    conf = {
        'cores': 1,
        'memory': '120 MB',
        'disk': '120 MB',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': "#!/usr/bin/env condor_submit",
        'local-directory': "/tmp",
    }

    with dask.config.set({'jobqueue.htcondor-config-name': conf}):
        with HTCondorCluster(config_name='htcondor-config-name') as cluster:
            assert cluster.name == 'myname'
Example #8
def main():
    n_port = 8786
    with HTCondorCluster(cores=1,
                         memory='100MB',
                         disk='100MB',
                         death_timeout='60',
                         nanny=True,
                         scheduler_options={
                             'port': n_port,
                             'host': socket.gethostname()
                         },
                         job_extra={
                             'should_transfer_files': 'Yes',
                             'when_to_transfer_output': 'ON_EXIT'
                         },
                         extra=['--worker-port {}'.format(n_port)]) as cluster:
        with Client(cluster) as client:
            cluster.scale(1)
            future = client.submit(lambda x: x + 1, 10)
            print('Result is {}'.format(future.result()))
Example #9
def main(args):
    base_dir = args.base_dir
    n_workers = args.n_workers

    input_files = [base_dir + "/" + fl for fl in os.listdir(base_dir)]
    print("Found {} files in {}".format(len(input_files), base_dir))

    start = time()

    n_port = 8786
    with HTCondorCluster(cores=1,
                         memory="500MB",
                         disk="500MB",
                         death_timeout="60",
                         nanny=False,
                         scheduler_options={
                             "port": n_port,
                             "host": socket.gethostname()
                         },
                         job_extra={
                             "should_transfer_files": "Yes",
                             "when_to_transfer_output": "ON_EXIT",
                             "+JobFlavour": "espresso"
                         },
                         extra=["--worker-port {}".format(n_port)]) as cluster:
        #print(cluster.job_script())
        with Client(cluster) as client:
            cluster.scale(n_workers)
            futures = client.map(dummy_extractor, input_files)
            arrays = client.gather(futures)

    end = time()

    final_arr = ak.concatenate(arrays)

    print("Done concatenating")
    print(final_arr.type)
    print("Computation time: {}".format(
        str(timedelta(seconds=int(end - start)))))
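
The excerpt never shows how `args` is built; a minimal driver, assuming `base_dir` and `n_workers` are ordinary command-line options (everything beyond those two names is illustrative), could be:

if __name__ == "__main__":
    import argparse

    # Assumed entry point, not part of the original excerpt; only base_dir and
    # n_workers are actually used by main() above.
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-dir", dest="base_dir", required=True)
    parser.add_argument("--n-workers", dest="n_workers", type=int, default=4)
    main(parser.parse_args())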
Example #10
def test_job_script():
    with HTCondorCluster(
            cores=4,
            processes=2,
            memory="100MB",
            disk="100MB",
            env_extra=[
                'export LANG="en_US.utf8"', 'export LC_ALL="en_US.utf8"'
            ],
            job_extra={"+Extra": "True"},
            submit_command_extra=["-verbose"],
            cancel_command_extra=["-forcex"],
    ) as cluster:
        job_script = cluster.job_script()
        assert "RequestCpus = MY.DaskWorkerCores" in job_script
        assert "RequestDisk = floor(MY.DaskWorkerDisk / 1024)" in job_script
        assert "RequestMemory = floor(MY.DaskWorkerMemory / 1048576)" in job_script
        assert "MY.DaskWorkerCores = 4" in job_script
        assert "MY.DaskWorkerDisk = 100000000" in job_script
        assert "MY.DaskWorkerMemory = 100000000" in job_script
        assert 'MY.JobId = "$(ClusterId).$(ProcId)"' in job_script
        assert "LANG=en_US.utf8" in job_script
        assert "LC_ALL=en_US.utf8" in job_script
        assert "export" not in job_script
        assert "+Extra = True" in job_script
        assert re.search(r"condor_submit\s.*-verbose",
                         cluster._dummy_job.submit_command)
        assert re.search(r"condor_rm\s.*-forcex",
                         cluster._dummy_job.cancel_command)

        assert ("{} -m distributed.cli.dask_worker tcp://".format(
            sys.executable) in job_script)
        formatted_bytes = format_bytes(parse_bytes("50MB")).replace(" ", "")
        assert f"--memory-limit {formatted_bytes}" in job_script
        assert "--nthreads 2" in job_script
        assert "--nprocs 2" in job_script
Example #11
                    require_encryption=True)

HTCondorJob.submit_command = "condor_submit -spool"

cluster = HTCondorCluster(
    cores=4,
    memory="2GB",
    disk="1GB",
    log_directory="logs",
    silence_logs="debug",
    scheduler_options={
        "dashboard_address": "8786",
        "port": 8787,
        "external_address": "129.93.183.33:8787"
    },
    # HTCondor submit script
    job_extra={
        "universe": "docker",
        # To be used with coffea-casa:0.1.11
        "encrypt_input_files": "/etc/cmsaf-secrets/xcache_token",
        #"docker_network_type": "host",
        "docker_image": "coffeateam/coffea-casa-analysis:0.1.11",
        "container_service_names": "dask",
        "dask_container_port": "8787",
        "should_transfer_files": "YES",
        "when_to_transfer_output": "ON_EXIT",
        "+DaskSchedulerAddress": '"129.93.183.33:8787"',
    })

# auto-scale between 5 and 100 jobs (maximum_memory="4 GB")
cluster.adapt(minimum_jobs=5, maximum_jobs=100, maximum_memory="4 GB")
Example #12
from tdub import setup_logging
from tdub.train import prepare_from_root
from tdub.utils import get_selection, get_features, quick_files

import lightgbm as lgbm
from sklearn.model_selection import train_test_split

from dask_jobqueue import HTCondorCluster
from dask.distributed import Client
from dask_ml.model_selection import GridSearchCV

cluster = HTCondorCluster(cores=2, disk="4GB", memory="8GB")
client = Client(cluster)
cluster.adapt(maximum_jobs=200)

setup_logging()

qf = quick_files("/atlasgpfs01/usatlas/data/ddavis/wtloop/v29_20191111")

df, y, w = prepare_from_root(qf["tW_DR"], qf["ttbar"], "1j1b")

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    df, y, w, train_size=0.8, random_state=414, shuffle=True)

n_sig = y_train[y_train == 1].shape[0]
n_bkg = y_train[y_train == 0].shape[0]
spw = n_bkg / n_sig

n_sig = y[y == 1].shape[0]
n_bkg = y[y == 0].shape[0]
spw = n_bkg / n_sig
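
The example stops after computing the scale-pos-weight; a minimal sketch of how the imported dask_ml GridSearchCV might then be run on the HTCondor-backed cluster (the estimator settings and parameter grid are assumptions, not taken from the original) could look like:

# Hypothetical continuation: the grid and estimator options below are
# illustrative; only spw, X_train and y_train come from the code above.
search = GridSearchCV(
    lgbm.LGBMClassifier(boosting_type="gbdt", scale_pos_weight=spw),
    param_grid={
        "learning_rate": [0.05, 0.1],
        "num_leaves": [31, 63],
        "n_estimators": [200, 500],
    },
    cv=3,
)
search.fit(X_train, y_train)  # tasks run on the Dask workers attached via Client(cluster)
print(search.best_params_, search.best_score_)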
Example #13
def test_header():
    with HTCondorCluster(cores=1, memory='100MB', disk='100MB') as cluster:
        assert cluster.job_header_dict['MY.DaskWorkerCores'] == 1
        assert cluster.job_header_dict['MY.DaskWorkerDisk'] == 100000000
        assert cluster.job_header_dict['MY.DaskWorkerMemory'] == 100000000
Example #14
    if not check_port(8786):
        raise RuntimeError(
            "Port '8786' is not occupied on this node. Try another one."
        )
    import socket
    cluster = HTCondorCluster(
        cores=1,
        memory='2GB',  # hardcoded
        disk='1GB',
        death_timeout='60',
        nanny=False,
        scheduler_options={
            'port': n_port,
            'host': socket.gethostname()
        },
        job_extra={
            'log': 'dask_job_output.log',
            'output': 'dask_job_output.out',
            'error': 'dask_job_output.err',
            'should_transfer_files': 'Yes',
            'when_to_transfer_output': 'ON_EXIT',
            '+JobFlavour': '"workday"',
        },
        extra=['--worker-port {}'.format(n_port)],
        env_extra=env_extra,
    )
elif 'slurm' in args.executor:
    cluster = SLURMCluster(
        queue='all',
        cores=args.workers,
        processes=args.workers,
Example #15
HTCondorJob.submit_command = "condor_submit -spool"

cluster = HTCondorCluster(
    cores=4,
    memory="2GB",
    disk="1GB",
    log_directory="logs",
    silence_logs="debug",
    scheduler_options={
        "dashboard_address": "8786",
        "port": 8787,
        "external_address": "129.93.183.33:8787"
    },
    # HTCondor submit script
    job_extra={
        "universe": "docker",
        # To be used with coffea-casa:0.1.7
        "transfer_input_files":
        "/etc/cmsaf-secrets/xcache_token,/etc/cmsaf-secrets/ca.pem,/etc/cmsaf-secrets/usercert.pem",
        "encrypt_input_files":
        "/etc/cmsaf-secrets/xcache_token,/etc/cmsaf-secrets/ca.pem,/etc/cmsaf-secrets/usercert.pem",
        #"docker_network_type": "host",
        "docker_image": "oshadura/coffea-casa-analysis:0.1.7",
        "container_service_names": "dask",
        "dask_container_port": "8787",
        "should_transfer_files": "YES",
        "when_to_transfer_output": "ON_EXIT",
        "+DaskSchedulerAddress": '"129.93.183.33:8787"',
    })

cluster.scale(jobs=2)
Example #16
        from dask.distributed import performance_report

        if 'slurm' in args.executor:
            cluster = SLURMCluster(
                queue='all',
                cores=args.workers,
                processes=args.workers,
                memory="200 GB",
                retries=10,
                walltime='00:30:00',
                env_extra=env_extra,
            )
        elif 'condor' in args.executor:
            cluster = HTCondorCluster(
                cores=args.workers,
                memory='4GB',
                disk='4GB',
                env_extra=env_extra,
            )

        if args.executor == 'dask/casa':
            client = Client("tls://localhost:8786")
            # client = Client("tls://andrzej-2enovak-40cern-2ech.dask.coffea.casa:8786")
            #             import shutil
            #             shutil.make_archive("workflows", "zip", base_dir="workflows")
            #             client.upload_file("workflows.zip")
            from distributed.diagnostics.plugin import UploadDirectory
            #             client.register_worker_plugin(UploadDirectory("workflows", restart=True, update_path=True), nanny=True)
            client.register_worker_plugin(UploadDirectory("workflows",
                                                          restart=False,
                                                          update_path=True),
                                          nanny=True)