Example #1
0
    def create_sge_cluster(self):
        """Create an ``SGECluster`` configured from ``self.config``.

        Queue, memory, processes, cores, resource_spec and job_extra are read
        from ``self.config.sge_options``; ``self.config.parallel`` is used as
        both the minimum and the maximum of the adaptive worker range. The
        settings, the cluster, its job script and its dashboard link are
        printed before the cluster is returned.
        """
        n_workers = self.config.parallel
        sge = self.config.sge_options

        # Normalise "a, b ,c" into "a,b,c".
        queue_names = ",".join(name.strip() for name in sge.queue.split(","))
        mem = sge.memory
        n_processes = int(sge.processes)
        n_cores = int(sge.cores)
        spec = sge.resource_spec
        extra_directives = sge.job_extra

        print(
            "SGE:",
            "queue=", queue_names,
            "memory=", mem,
            "processes=", n_processes,
            "cores=", n_cores,
            "resource_spec=", spec,
            "job_extra=", extra_directives,
        )

        sge_cluster = SGECluster(
            name="sgains-tools",
            queue=queue_names,
            memory=mem,
            processes=n_processes,
            cores=n_cores,
            resource_spec=spec,
            job_extra=extra_directives,
            walltime='08:00:00',
            dashboard_address='0.0.0.0:28787',
        )
        sge_cluster.adapt(minimum=n_workers, maximum=n_workers)

        # The dashboard link is printed both before and after the job script.
        print("SGE cluster dashboard link:", sge_cluster.dashboard_link)
        print(sge_cluster)
        print(sge_cluster.job_script())
        print("SGE cluster dashboard link:", sge_cluster.dashboard_link)

        return sge_cluster
def init_cluster(args):
    """Build an ``SGECluster`` from ``args`` and scale it to ``args.jobs``.

    ``args`` must provide ``log_dir``, ``ngpus``, ``ncpus``, ``queue``,
    ``h_vmem``, ``mem_req`` and ``jobs``. The scaled cluster is returned.
    """
    # stdout/stderr go to the log directory, or are discarded if none is set.
    log_target = args.log_dir or "/dev/null"
    # Request one parallel-environment slot per GPU, or per CPU without GPUs.
    slots = args.ngpus if args.ngpus > 0 else args.ncpus

    scheduler_directives = [
        "#$ -e {}".format(log_target),
        "#$ -o {}".format(log_target),
        "#$ -pe serial {}".format(slots),
    ]
    exports = [
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
    ]

    sge_cluster = SGECluster(
        name="test_Dask_PytorchDataloader",
        queue=args.queue,
        walltime="720:00:00",
        resource_spec="h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req),
        cores=args.ncpus,
        processes=1,
        memory="{}G".format(args.mem_req),
        interface="ib0",
        local_directory=".",
        spill_dir=".",
        env_extra=scheduler_directives + exports,
        extra=["--no-nanny"],
    )
    sge_cluster.scale(args.jobs)
    return sge_cluster
Example #3
0
def setup_client_and_cluster(number_processes=1,
                             number_jobs=1,
                             walltime="00:01:00",
                             memory=1):
    """
    Setup Dask client and cluster.
    Ensure that the number of workers is the right amount
    for your job and will be fully utilised.

    Args:
        number_processes: slot count requested via ``-pe smp`` for each job.
        number_jobs: number of jobs the cluster is scaled to.
        walltime: walltime passed to ``SGECluster``.
        memory: number of gigabytes used for the worker memory, the
            ``h_vmem`` resource and the ``disk`` resource.

    Returns:
        A ``(client, cluster)`` tuple.
    """
    print("Setting up Dask client and cluster ...")
    # PWD may be unset (e.g. when not launched from a login shell); joining
    # None would raise TypeError, so fall back to the real working directory.
    working_dir = os.environ.get("PWD") or os.getcwd()
    # these are the requirements for a single worker
    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory=f"{memory} G",
        resource_spec=f"h_vmem={memory}G",
        scheduler_options={"dashboard_address": ":2727"},
        job_extra=[
            "-V",  # export all environment variables
            f"-pe smp {number_processes}",
            f"-l disk={memory}G",
        ],
        local_directory=os.sep.join([working_dir, "dask-worker-space"]),
    )
    client = Client(cluster)
    cluster.scale(jobs=number_jobs)
    print("The resources of each worker are: ")
    print(cluster.job_script())
    return client, cluster
def scale_to_sge(n_workers):
    """Start an ``SGECluster`` on the ``q_gpu`` queue and return a client.

    The cluster uses one core and one process per job with 4GB of memory,
    keeps both its logs and its local directory under ``./logs``, and is
    scaled up to ``n_workers`` before the ``Client`` is created.
    """
    from dask_jobqueue import SGECluster

    log_dir = "./logs"
    sge_cluster = SGECluster(
        queue="q_gpu",
        resource_spec="q_gpu=TRUE",
        memory="4GB",
        cores=1,
        processes=1,
        log_directory=log_dir,
        local_directory=log_dir,
    )
    sge_cluster.scale_up(n_workers)
    return Client(sge_cluster)
Example #5
0
File: cluster.py Project: wx-b/rlbc
def make_client(cluster, type_workers, num_workers, log_dir, no_nanny=False):
    """
    Create a dask ``Client`` attached to the backend named by ``cluster``.

    Args:
        cluster: backend name: 'paris' (SGE), 'grenoble' (GPUCluster) or
            'local' (LocalCluster).
        type_workers: accepted for interface compatibility; not used here.
        num_workers: number of workers started on 'paris', or the adaptive
            maximum on 'grenoble'; not used for 'local'.
        log_dir: base log directory; only the 'grenoble' backend derives its
            dask log directory from it.
        no_nanny: allows workers to create their own workers. Useful if you
            have gpu workers creating their own cpu workers for data loading.

    Returns:
        The ``Client`` connected to the created cluster.

    Raises:
        ValueError: if ``cluster`` is not a known backend name.
    """
    if no_nanny:
        extra = ['--no-nanny', '--no-bokeh']
        processes = False
    else:
        extra = []
        processes = True

    if cluster == 'paris':
        from dask_jobqueue import SGECluster

        cluster = SGECluster(
            queue='gaia.q,chronos.q,titan.q,zeus.q',
            resource_spec='h_vmem=2000000M,mem_req=2000M',
            job_extra=['-pe serial 1'],
            env_extra=[
                'source /sequoia/data1/rstrudel/miniconda3/etc/profile.d/conda.sh',
                'conda activate bullet', 'export LANG=en_US.UTF-8',
                'export LC_ALL=en_US.UTF-8',
                'export PYTHONUNBUFFERED=non_empty'
            ],
            walltime='720:00:00',
            memory='4GB',
            extra=extra,
            cores=1,
            local_directory=os.path.join('/sequoia/data2', getpass.getuser(),
                                         'dask'))
        cluster.start_workers(num_workers)
    elif cluster == 'grenoble':
        from bc.utils.dask_grenoble import GPUCluster

        dask_log_dir = log_dir.replace('agents', 'dask').replace('/seed', '-s')
        # exist_ok avoids the check-then-create race between concurrent runs
        # and also creates any missing parent directories.
        os.makedirs(dask_log_dir, exist_ok=True)
        cluster = GPUCluster(
            extra=['--no-nanny', '--no-bokeh'],
            walltime='72:00:00',
            log_dir=dask_log_dir,
            besteffort=True,
            interface_node='edgar',
        )
        cluster.adapt(minimum=0, maximum=num_workers)
    elif cluster == 'local':
        cluster = LocalCluster(processes=processes)
    else:
        raise ValueError('Unknown cluster name: {}'.format(cluster))

    client = Client(cluster)
    return client
Example #6
0
def test_job_script(tmpdir):
    """The generated SGE job script contains every configured directive."""
    log_directory = tmpdir.strpath
    expected_fragments = (
        "--nprocs 2",
        "--nthreads 3",
        "--memory-limit 6.00GB",
        "-q my-queue",
        "-P my-project",
        "-l h_rt=02:00:00",
        "export MY_VAR=my_var",
        "#$ -w e",
        "#$ -m e",
        "#$ -e {}".format(log_directory),
        "#$ -o {}".format(log_directory),
        "-l h_vmem=12G,mem_req=12G",
        "#$ -cwd",
        "#$ -j y",
    )
    cluster_options = dict(
        cores=6,
        processes=2,
        memory="12GB",
        queue="my-queue",
        project="my-project",
        walltime="02:00:00",
        env_extra=["export MY_VAR=my_var"],
        job_extra=["-w e", "-m e"],
        log_directory=log_directory,
        resource_spec="h_vmem=12G,mem_req=12G",
    )
    with SGECluster(**cluster_options) as cluster:
        script_text = cluster.job_script()
        for fragment in expected_fragments:
            assert fragment in script_text
Example #7
0
def test_config_name_sge_takes_custom_config():
    """``SGECluster`` reads its settings from the section named by ``config_name``."""
    custom_section = {
        "queue": "myqueue",
        "project": "myproject",
        "ncpus": 1,
        "cores": 1,
        "memory": "2 GB",
        "walltime": "00:02",
        "job-extra": [],
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "local-directory": "/foo",
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env bash",
        "job-cpu": None,
        "job-mem": None,
        "resource-spec": None,
    }
    overrides = {"jobqueue.sge-config-name": custom_section}

    # The config override must be active while the cluster is constructed.
    with dask.config.set(overrides), \
            SGECluster(config_name="sge-config-name") as cluster:
        assert cluster.name == "myname"
Example #8
0
def test_basic(loop):
    """Scale to two jobs, run one task, check worker resources, scale to zero."""
    cluster_options = dict(
        walltime="00:02:00",
        cores=8,
        processes=4,
        memory="2GB",
        loop=loop,
    )
    with SGECluster(**cluster_options) as cluster:
        with Client(cluster, loop=loop) as client:

            cluster.scale(2)

            # Wait until at least one job is pending or running.
            deadline = time() + QUEUE_WAIT
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < deadline

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            first_worker = list(client.scheduler_info()["workers"].values())[0]
            assert first_worker["memory_limit"] == 2e9 / 4
            assert first_worker["nthreads"] == 2

            cluster.scale(0)

            # Wait for every running job to disappear.
            deadline = time() + QUEUE_WAIT
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < deadline
Example #9
0
def setup_cluster(
        memory='2G',
        gpus=0,
        log_dir=None,
        timeout_s=str(3600 * 24 * 7),  # a week
        proc_per_worker=1,
        cores_per_proc=1,
        env_extra=None,
        job_extra=None,
        grid='clsp',
        *args,
        **kwargs) -> SGECluster:
    """Create an ``SGECluster`` whose jobs carry a qsub-style resource spec.

    Args:
        memory: memory per job; also used for the ``mem_free`` (and, on
            'clsp', ``ram_free``) resources with 'GB' rewritten to 'G'.
        gpus: number of GPUs per job; a truthy value switches to the GPU
            queue of the selected grid.
        log_dir: log directory; defaults to 'log'.
        timeout_s: walltime passed to ``SGECluster``.
        proc_per_worker: value passed as ``processes``.
        cores_per_proc: value passed as ``cores``.
        env_extra: extra lines for the job script environment, e.g.
            ['export ENV_VARIABLE="SOMETHING"', 'source myscript.sh'].
            The list is copied, never modified in place.
        job_extra: extra scheduler directives.
        grid: 'clsp' or 'coe'.
        *args, **kwargs: forwarded to ``SGECluster``.

    Raises:
        ValueError: if ``grid`` is not 'clsp' or 'coe'.
    """
    # Copy so the GPU export appended below never leaks into the caller's list.
    env_extra = [] if env_extra is None else list(env_extra)
    queue = 'all.q'

    if grid == 'clsp':
        # Memory specification (CLSP grid specific)
        resource_parts = [
            f'mem_free={memory},ram_free={memory}'.replace('GB', 'G')
        ]
        if gpus:
            # Number of GPUs + limit hosts to c nodes (with PyTorch
            # compatible GPUs)
            resource_parts.append(f'gpu={gpus},hostname=c*')
            queue = 'g.q'
            # Check which GPU is free to use
            env_extra.append(
                f'export CUDA_VISIBLE_DEVICES=$(free-gpu -n {gpus})')
    elif grid == 'coe':
        # Memory specification (COE grid specific)
        resource_parts = [f'mem_free={memory}'.replace('GB', 'G')]
        if gpus:
            resource_parts.append(f'gpu={gpus}')
            queue = 'gpu.q'
    else:
        raise ValueError(
            f"Unknown grid {grid!r}; expected 'clsp' or 'coe'")

    # Join with commas so the GPU and memory requests stay separate entries
    # (no leading comma, no entries fused together).
    resource_spec = ','.join(resource_parts)

    # Create a "mini cluster" that our jobs will get submitted to
    return SGECluster(
        queue=queue,
        walltime=timeout_s,
        processes=proc_per_worker,
        memory=memory,
        cores=cores_per_proc,
        resource_spec=resource_spec,
        log_directory=log_dir if log_dir is not None else 'log',
        job_extra=job_extra,
        env_extra=env_extra,
        *args,
        **kwargs,
    )
Example #10
0
def test_config_name_sge_takes_custom_config():
    """``SGECluster`` picks up the section selected through ``config_name``."""
    section = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash',
        'job-cpu': None,
        'job-mem': None,
        'resource-spec': None,
    }
    patched_config = {'jobqueue.sge-config-name': section}

    # Keep the override active for the whole lifetime of the cluster.
    with dask.config.set(patched_config), \
            SGECluster(config_name='sge-config-name') as cluster:
        assert cluster.name == 'myname'
Example #11
0
def test_basic(loop):  # noqa: F811
    """Scale to two jobs, run one task, check worker resources, scale to zero."""
    options = dict(walltime='00:02:00',
                   cores=8,
                   processes=4,
                   memory='2GB',
                   loop=loop)
    with SGECluster(**options) as cluster:
        print(cluster.job_script())
        with Client(cluster, loop=loop) as client:

            cluster.scale(2)

            # Wait until at least one job is pending or running.
            deadline = time() + QUEUE_WAIT
            while not (cluster.pending_jobs or cluster.running_jobs):
                sleep(0.100)
                assert time() < deadline

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            first_worker = list(client.scheduler_info()['workers'].values())[0]
            assert first_worker['memory_limit'] == 2e9 / 4
            assert first_worker['ncores'] == 2

            cluster.scale(0)

            # Wait for every running job to disappear.
            deadline = time() + QUEUE_WAIT
            while cluster.running_jobs:
                sleep(0.100)
                assert time() < deadline
Example #12
0
def test_basic(loop):  # noqa: F811
    """Start two workers, run one task, check resources, then stop the workers."""
    options = dict(walltime='00:02:00',
                   cores=8,
                   processes=4,
                   memory='28GB',
                   loop=loop)
    with SGECluster(**options) as cluster:
        with Client(cluster, loop=loop) as client:
            started = cluster.start_workers(2)
            future = client.submit(lambda x: x + 1, 10)
            assert future.result(60) == 11
            assert cluster.jobs

            for worker in client.scheduler_info()['workers'].values():
                assert worker['memory_limit'] == 7e9
                assert worker['ncores'] == 2

            cluster.stop_workers(started)

            # The scheduler must report no workers within ten seconds.
            deadline = time() + 10
            while len(client.scheduler_info()['workers']) > 0:
                sleep(0.100)
                assert time() < deadline

            assert not cluster.jobs
Example #13
0
    def get_cluster(which="ccin2p3", scale=None, set_client=True, **kwargs):
        """Return an ``SGECluster`` preconfigured for the "ccin2p3" site.

        ``kwargs`` override the built-in defaults. When ``scale`` is not
        None the cluster is scaled to ``int(scale)``. ``set_client`` is
        accepted but not used by this function. Any other value of
        ``which`` raises ``NotImplementedError``.
        """
        if which != "ccin2p3":
            raise NotImplementedError(f"only 'ccin2p3' cluster implemented {which} given")

        from dask_jobqueue import SGECluster

        options = dict(name="dask-worker", walltime="06:00:00",
                       memory='8GB', death_timeout=120, project="P_ztf",
                       resource_spec='sps=1', cores=1, processes=1)
        # Caller-supplied keywords take precedence over the defaults.
        options.update(kwargs)
        sge_cluster = SGECluster(**options)

        if scale is not None:
            sge_cluster.scale(int(scale))

        return sge_cluster
Example #14
0
def process_dask(
    funcs,
    jobs=10,
    cores=3,
    processes=3,
    h_vmem=20,
    m_mem_free=5,
    h_rt=3000,
):
    """Map ``call`` over ``funcs`` on a temporary SGE cluster.

    Args:
        funcs: iterable of items passed one by one to ``call``.
        jobs: number of jobs the cluster is scaled to.
        cores: cores per job.
        processes: worker processes per job.
        h_vmem: gigabytes used for the worker memory and the ``h_vmem``
            resource.
        m_mem_free: gigabytes used for the ``m_mem_free`` resource.
        h_rt: value used for the walltime and the ``h_rt`` resource.

    Returns:
        The gathered results, in the order of ``funcs``.
    """
    cluster = SGECluster(
        n_workers=0,
        job_cls=None,
        loop=None,
        security=None,
        silence_logs='error',
        name=None,
        asynchronous=False,
        interface=None,
        host=None,
        protocol='tcp://',
        dashboard_address=':8787',
        config_name=None,
        processes=processes,
        queue='low.q',
        project="labxchem",
        cores=cores,
        memory="{}GB".format(h_vmem),
        walltime=h_rt,
        resource_spec="m_mem_free={}G,h_vmem={}G,h_rt={}".format(
            m_mem_free, h_vmem, h_rt),
    )

    # The cluster and client only live for this call; close them on success
    # and on failure so no queued jobs are left running.
    try:
        cluster.scale(jobs=jobs)
        with Client(cluster) as client:
            results_futures = client.map(
                call,
                funcs,
            )
            results = client.gather(results_futures)
    finally:
        cluster.close()

    return results
Example #15
0
def init_cluster(name, args):
    """Build an ``SGECluster`` called ``name`` and scale it to ``args.jobs``.

    ``args`` must provide ``h_vmem``, ``mem_req``, ``exclude_nodes``,
    ``log_dir``, ``ngpus``, ``ncpus``, ``to_source``, ``export_var``,
    ``queue``, ``spill_dir`` and ``jobs``. Every name in ``args.export_var``
    is copied from the current environment into the job environment.
    """
    resource_spec = "h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req)

    # "!a&!b" host filter; stays an empty string when nothing is excluded.
    excluded = "&".join("!" + node for node in args.exclude_nodes)
    exclude_directive = "#$ -l h=" + excluded if excluded else ""

    if args.to_source is not None:
        source_line = "source " + args.to_source
    else:
        source_line = ""

    log_target = args.log_dir or "/dev/null"
    # One parallel-environment slot per GPU, or per CPU without GPUs.
    slots = args.ngpus if args.ngpus > 0 else args.ncpus

    env_extra = [
        "#$ -e {}".format(log_target),
        "#$ -o {}".format(log_target),
        "#$ -pe serial {}".format(slots),
        exclude_directive,
        source_line,
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
        "export TORCH_HOME=/sequoia/data1/rriochet/.torch",
    ]
    env_extra += [
        f'export {var}="{os.environ[var]}"' for var in args.export_var
    ]

    sge_cluster = SGECluster(
        name=name,
        queue=args.queue,
        walltime="720:00:00",
        resource_spec=resource_spec,
        cores=args.ncpus,
        processes=1,
        memory="{}G".format(args.mem_req),
        interface="ib0",
        local_directory=args.log_dir,
        spill_dir=args.spill_dir,
        env_extra=env_extra,
        extra=["--no-nanny"],
    )
    sge_cluster.scale(args.jobs)
    return sge_cluster
Example #16
0
def get_client():
    """Start an SGE cluster scaled to 60 and return a connected ``Client``.

    Sets the dask ``distributed.admin.tick.limit`` option to "300s" first,
    and sleeps 15 seconds between scaling and creating the client.
    """
    dask.config.set({"distributed.admin.tick.limit": "300s"})

    cluster_options = dict(
        queue="medium.q",
        project="labxchem",
        cores=10,
        processes=5,
        memory="64GB",
        resource_spec="m_mem_free=64G,redhat_release=rhel7",
        python="/dls/science/groups/i04-1/conor_dev/ccp4/build/bin/cctbx.python",
        walltime="03:00:00",
    )
    sge_cluster = SGECluster(**cluster_options)
    sge_cluster.scale(60)

    time.sleep(15)

    return Client(sge_cluster)
Example #17
0
    def run(self,
            block: bool = True,
            cluster: bool = False,
            cluster_kwargs: dict = None,
            workers: int = 8,
            debug: bool = False) -> Union[Future, Any]:
        """Execute the pipeline, optionally on an SGE cluster.

        Parameters
        ----------
        block
            If True (default), wait for completion; if False, a
            :class:`Future` is returned instead.
        cluster
            If True, run on rhino's SGE cluster (default: False).
        cluster_kwargs
            Keyword arguments for :class:`SGECluster`, layered on top of
            ``CLUSTER_DEFAULTS``.
        workers
            Number of workers to scale the SGE cluster to (default: 8).
        debug
            If True, no cluster is started and the single-threaded dask
            scheduler is used for debugging.

        Returns
        -------
        The pipeline result when ``block`` is set; otherwise a
        :class:`Future` which resolves when the pipeline is complete.

        """
        if cluster and not debug:
            from dask_jobqueue import SGECluster
            from dask.distributed import Client

            options = dict(CLUSTER_DEFAULTS)
            if cluster_kwargs is not None:
                options.update(cluster_kwargs)

            cluster = SGECluster(**options)
            cluster.scale(workers)
            # Keep the client bound to a local for the rest of this call.
            _ = Client(cluster)

        if block or debug:
            return self._run_sync(debug)
        return self._run_async()
def main():
    """Run the health impact assessment for the configured simulations.

    Starts an SGE cluster with 20 jobs, maps the assessment function that
    matches the module-level ``output`` over a dask bag of simulation
    names, prints the elapsed time and closes the client and the cluster.
    """
    # dask cluster and client
    n_jobs = 20
    n_processes = 1
    n_workers = n_processes * n_jobs

    scratch_dir = os.sep.join([os.environ.get("PWD"), "dask-hia-space"])
    cluster = SGECluster(
        interface="ib0",
        walltime="02:00:00",
        memory="48 G",
        resource_spec="h_vmem=48G",
        scheduler_options={"dashboard_address": ":7777"},
        job_extra=["-cwd", "-V", f"-pe smp {n_processes}", "-l disk=48G"],
        local_directory=scratch_dir,
    )

    client = Client(cluster)
    cluster.scale(jobs=n_jobs)
    started_at = time.time()

    # dask bag and process
    simulations = [f'emulator_Base_CLE_2020_{output}']

    # Alternative simulation sets, kept for reference:
    #simulations = []
    #simulations.append(f'wrfchem_Base_CLE_2020_{output}')
    #simulations.append(f'wrfchem_Base_CLE_2050_{output}')
    #simulations.append(f'wrfchem_Base_MFR_2050_{output}')
    #simulations.append(f'wrfchem_SDS_MFR_2050_{output}')

    #for year in ['2020', '2030', '2040', '2050']:
    #    for scenario in ['Base_CLE', 'Base_MFR', 'SDS_MFR']:
    #        for sim in ['', '_RES', '_IND', '_TRA', '_AGR', '_ENE', '_NO_RES', '_NO_IND', '_NO_TRA', '_NO_AGR', '_NO_ENE']:
    #            simulations.append(f'emulator_{scenario}_{year}{sim}_{output}')

    print(f"predicting for {len(simulations)} custom outputs ...")
    simulation_bag = db.from_sequence(simulations, npartitions=n_workers)

    # Only these two outputs have an assessment; any other value runs nothing.
    if output == "PM2_5_DRY":
        simulation_bag.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        simulation_bag.map(health_impact_assessment_o3).compute()

    elapsed = time.time() - started_at
    print(
        f"completed in {elapsed:0.2f} seconds, or {elapsed / 60:0.2f} minutes, or {elapsed / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
Example #19
0
def main():
    """Regrid the remaining custom outputs to the population grid.

    Starts an SGE cluster with 35 jobs, finds the files under the
    module-level ``path`` for ``output`` that have no matching
    ``_popgrid_0.05deg.nc`` file yet, maps ``regrid_to_pop`` over at most
    2,500 of them and prints timing information. The client and the
    cluster are closed even when processing fails.
    """
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(interface='ib0',
                         walltime='01:00:00',
                         memory=f'64 G',
                         resource_spec=f'h_vmem=64G',
                         scheduler_options={
                             'dashboard_address': ':5757',
                         },
                         job_extra=['-cwd', '-V', f'-pe smp {n_processes}'],
                         local_directory=os.sep.join(
                             [os.environ.get('PWD'), 'dask-worker-space']))

    client = None
    try:
        client = Client(cluster)

        cluster.scale(jobs=n_jobs)

        time_start = time.time()

        # regrid custom outputs to pop grid
        custom_outputs = glob.glob(path + 'ds*' + output + '.nc')
        custom_outputs_completed = glob.glob(path + 'ds*' + output +
                                             '_popgrid_0.05deg.nc')
        # strip the 19-character '_popgrid_0.05deg.nc' suffix to recover the
        # name of the source file
        custom_outputs_completed = [
            f'{item[0:-19]}.nc' for item in custom_outputs_completed
        ]
        custom_outputs_remaining = list(
            set(custom_outputs) - set(custom_outputs_completed))
        print(
            f'custom outputs remaining for {output}: {len(custom_outputs_remaining)}'
        )

        # dask bag and process
        # run in 2,500 chunks over 30 cores, each chunk taking 5 minutes
        custom_outputs_remaining = custom_outputs_remaining[0:2500]
        print(
            f'predicting for {len(custom_outputs_remaining)} custom outputs ...'
        )
        bag_custom_outputs = db.from_sequence(custom_outputs_remaining,
                                              npartitions=n_workers)
        bag_custom_outputs.map(regrid_to_pop).compute()

        time_end = time.time() - time_start
        print(
            f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
        )
        # An average is undefined (division by zero) when nothing was left.
        if custom_outputs_remaining:
            print(
                f'average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds'
            )
    finally:
        if client is not None:
            client.close()
        cluster.close()
Example #20
0
def main():
    """Population-weight the outputs for a grid of five-dimensional inputs.

    Starts an SGE cluster with 35 jobs, maps
    ``popweight_outputs_for_input`` over every combination of five input
    values (8 steps from 0 to 1.4 each) and dumps the results with joblib.
    Uses the module-level ``output`` and ``region``.
    """
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    scratch_dir = os.sep.join(
        [os.environ.get("PWD"), "dask-worker-space_popweighted_region"])
    cluster = SGECluster(
        interface="ib0",
        walltime="48:00:00",
        memory="12 G",
        resource_spec="h_vmem=12G",
        scheduler_options={"dashboard_address": ":5757"},
        job_extra=["-cwd", "-V", f"-pe smp {n_processes}", "-l disk=1G"],
        local_directory=scratch_dir,
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    # main processing: one row per combination of the five axes
    axes = [np.linspace(0, 1.4, 8) for _ in range(5)]
    matrix_stacked = np.array(np.meshgrid(*axes)).T.reshape(-1, 5)

    custom_inputs = [np.array(row).reshape(1, -1) for row in matrix_stacked]

    print(f"processing for {output} over {region} ...")
    input_bag = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = input_bag.map(popweight_outputs_for_input).compute()

    print("saving ...")
    joblib.dump(
        outputs_popweighted,
        f"/nobackup/earlacoa/machinelearning/data_annual/popweighted/popweighted_{region}_{output}_0.25deg_adjusted_scaled.joblib",
    )

    client.close()
    cluster.close()
def start_dask(num_workers, msg, logger):
    """Generator used for starting/shutting down dask.

    Meant to be driven as a context manager (it yields exactly once);
    NOTE(review): no ``contextmanager`` decorator is visible on this block —
    confirm how callers wrap it.

    The dask configuration is updated from ``dask-config.yaml`` and the
    first key of its ``jobqueue`` section selects the cluster type.

    Args:
        num_workers (`int`): Number of dask workers
        msg (`str`): Message for timer
        logger: The logger being used

    Yields:
        client: Dask client

    Raises:
        ValueError: If the configured cluster type is not one of
            'local', 'lsf', 'slurm' or 'sge'.
    """

    # Update dask
    with open("dask-config.yaml") as f:
        config = yaml.load(f, Loader=SafeLoader)
        dask.config.update(dask.config.config, config)

    cluster_type = next(iter(dask.config.config['jobqueue']))
    set_local_directory(cluster_type)

    if cluster_type == 'local':
        from dask.distributed import LocalCluster
        cluster = LocalCluster(n_workers=num_workers, threads_per_worker=1)
    else:
        if cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster
            cluster = LSFCluster()
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster
            cluster = SLURMCluster()
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster
            cluster = SGECluster()
        else:
            # Fail with a clear message instead of an unbound `cluster`.
            raise ValueError(
                f"Unsupported dask cluster type: {cluster_type!r}")
        cluster.scale(num_workers)

    # Bound up front so the cleanup below cannot mask a failure raised while
    # creating the client with an UnboundLocalError.
    client = None
    try:
        with io_util.Timing_Messager(f"Starting dask cluster for {msg}",
                                     logger):
            client = Client(cluster)
        io_util.print_with_datetime(
            f"Check {client.cluster.dashboard_link} for {msg} status.", logger)
        yield client
    finally:
        if client is not None:
            client.shutdown()
            client.close()
        else:
            # No client was created, so nothing else will stop the cluster.
            cluster.close()
Example #22
0
def test_complex_cancel_command(loop):
    """``stop_all_jobs`` works with a cancel command that carries arguments."""
    options = dict(
        walltime="00:02:00", cores=1, processes=1, memory="2GB", loop=loop
    )
    with SGECluster(**options) as cluster:
        username = "******"
        cluster.cancel_command = "qdel -u {}".format(username)

        cluster.scale(2)

        # Wait for at least one job to be running.
        deadline = time() + QUEUE_WAIT
        while not cluster.running_jobs:
            sleep(0.100)
            assert time() < deadline

        cluster.stop_all_jobs()

        # Wait for every running job to be cancelled.
        deadline = time() + QUEUE_WAIT
        while cluster.running_jobs:
            sleep(0.100)
            assert time() < deadline
def Start_Client(gpu_name):
    """Create a dask ``Client`` suited to the host this code runs on.

    On ``wn-wks2.fe.hhi.de`` an ``SGECluster`` is used; on the two GPU hosts
    ``CUDA_VISIBLE_DEVICES`` is set to ``gpu_name`` and a ``LocalCluster``
    is used. The function waits for one worker before returning.

    Args:
        gpu_name: value assigned to ``CUDA_VISIBLE_DEVICES`` on GPU hosts.

    Returns:
        The connected ``Client``.

    Raises:
        ValueError: If the current hostname is not a supported host.
    """

    hostname = socket.gethostname()
    n_workers = 1
    n_cores = 1

    wks2 = "wn-wks2.fe.hhi.de"
    gpu1 = "wn-gpu1.fe.hhi.de"
    gpu2 = "wn-gpu-104-01.fe.hhi.de"

    if hostname == wks2:
        mem = "20G"  # Allocated memory is critical. For this example it must be at least 16GB
        q = "wn-37.q"  # Check current queue status on https://hpc-management.fe.hhi.de/wn/phpqstat/

        cluster = SGECluster(
            n_workers=n_workers,
            cores=n_cores,
            memory=mem,
            resource_spec=f"h_vmem={mem}",
            host=hostname,
            queue=q,
            job_extra=[
                "-v MKL_NUM_THREADS=1,NUMEXPR_NUM_THREADS=1,OMP_NUM_THREADS=1"
            ])
    elif hostname in (gpu1, gpu2):
        os.environ[
            "CUDA_VISIBLE_DEVICES"] = gpu_name  # Check current status with nvidia-smi and pick GPU from 0-3
        cluster = LocalCluster(n_workers=n_workers,
                               threads_per_worker=n_cores,
                               host=hostname)
    else:
        # The message must use `wks2`; an undefined name here would turn the
        # intended ValueError into a NameError.
        raise ValueError(
            f"{hostname} is not a supported host. Please run this example on {wks2}, {gpu1} or {gpu2}."
        )

    client = Client(cluster)
    client.wait_for_workers(n_workers=n_workers)
    print(client)

    return client
def main():
    """Population-weight the outputs for a grid of five-dimensional inputs.

    Starts an SGE cluster with 35 jobs, maps
    ``popweight_outputs_for_input`` over every combination of five input
    values (16 steps from 0 to 1.5 each) and dumps the results with joblib.
    Uses the module-level ``output`` and ``region``.
    """
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    scratch_dir = os.sep.join([os.environ.get('PWD'), 'dask-worker-space'])
    cluster = SGECluster(
        interface='ib0',
        walltime='48:00:00',
        memory='12 G',
        resource_spec='h_vmem=12G',
        scheduler_options={'dashboard_address': ':5757'},
        job_extra=['-cwd', '-V', f'-pe smp {n_processes}', '-l disk=1G'],
        local_directory=scratch_dir,
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    # main processing: one row per combination of the five axes
    axes = [np.linspace(0, 1.5, 16) for _ in range(5)]
    matrix_stacked = np.array(np.meshgrid(*axes)).T.reshape(-1, 5)

    custom_inputs = [np.array(row).reshape(1, -1) for row in matrix_stacked]

    print(f'processing for {output} over {region} ...')
    input_bag = db.from_sequence(custom_inputs, npartitions=n_workers)
    outputs_popweighted = input_bag.map(popweight_outputs_for_input).compute()

    print('saving ...')
    target = ('/nobackup/earlacoa/machinelearning/data/popweighted/popweighted_' +
              region + '_' + output + '.joblib')
    joblib.dump(outputs_popweighted, target)

    client.close()
    cluster.close()
Example #25
0
def test_complex_cancel_command(loop):
    """Workers come and go with a cancel command that carries arguments."""
    options = dict(walltime="00:02:00",
                   cores=1,
                   processes=1,
                   memory="2GB",
                   loop=loop)
    with SGECluster(**options) as cluster:
        with Client(cluster) as client:
            username = "******"
            cluster.cancel_command = "qdel -u {}".format(username)

            cluster.scale(2)

            # Wait for the scheduler to report at least one worker.
            deadline = time() + QUEUE_WAIT
            while not client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < deadline

            cluster.scale(0)

            # Wait for the scheduler to report no workers.
            deadline = time() + QUEUE_WAIT
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < deadline
Example #26
0
def main():
    """Map ``weird_function`` over ``nums`` on an SGE cluster and save the results.

    Starts an SGE cluster with 35 jobs, computes the results through a dask
    bag and dumps them with joblib to ``/nobackup/<USER>/results.joblib``,
    where ``<USER>`` comes from the ``USER`` environment variable.
    """
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory=f"2 G",
        resource_spec=f"h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    bag = db.from_sequence(nums, npartitions=number_workers)
    results = bag.map(weird_function).compute()

    print("saving ...")
    # "${USER}" is shell syntax: inside an f-string it evaluated a Python name
    # `USER` and left a stray "$" in the path. Expand it from the environment.
    joblib.dump(results,
                os.path.expandvars("/nobackup/${USER}/results.joblib"))

    client.close()
    cluster.close()
Example #27
0
def main():
    """Map ``create_ozone_metric`` over ``sims`` on an SGE cluster.

    Starts an SGE cluster with 35 jobs, computes the results through a dask
    bag, then closes the client and the cluster.
    """
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs

    scratch_dir = os.sep.join([os.environ.get("PWD"), "dask-worker-space"])
    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory="12 G",
        resource_spec="h_vmem=12G",
        scheduler_options={"dashboard_address": ":2727"},
        job_extra=["-cwd", "-V", f"-pe smp {number_processes}", "-l disk=1G"],
        local_directory=scratch_dir,
    )

    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    sim_bag = db.from_sequence(sims, npartitions=number_workers)
    results = sim_bag.map(create_ozone_metric).compute()
    print("complete")

    client.close()
    cluster.close()
Example #28
0
def main():
    """Run ``custom_predicts`` for the custom inputs that have no summary file yet.

    Builds the full grid of 5-value custom inputs (each value rounded to one
    decimal), removes the combinations whose numbers already appear in a
    summary filename matching the module-level ``output``, and maps
    ``custom_predicts`` over at most 5000 of the remaining inputs on an SGE
    dask cluster.  Timing information is printed at the end.  The client and
    the cluster are closed even when processing raises.
    """
    from contextlib import closing

    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    # PWD can be missing from the environment; os.sep.join() would then fail
    # with a TypeError on None, so fall back to the process working directory.
    working_directory = os.environ.get('PWD') or os.getcwd()

    cluster = SGECluster(interface='ib0',
                         walltime='01:00:00',
                         memory='2 G',
                         resource_spec='h_vmem=2G',
                         scheduler_options={
                             'dashboard_address': ':5757',
                         },
                         project='admiralty',
                         job_extra=[
                             '-cwd',
                             '-V',
                             f'-pe smp {n_processes}',
                             '-l disk=1G',
                         ],
                         local_directory=os.sep.join(
                             [working_directory, 'dask-worker-space']))

    # closing() calls close() on the client first and then on the cluster,
    # matching the original shutdown order, but also on the error path.
    with closing(cluster), closing(Client(cluster)):
        cluster.scale(jobs=n_jobs)

        time_start = time.time()

        # custom inputs: every combination of five values taken from the same axis
        # 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
        axis = np.linspace(0, 1.5, 16)
        matrix_stacked = np.array(np.meshgrid(*[axis] * 5)).T.reshape(-1, 5)
        # round through a one-decimal string so the tuples compare equal to the
        # numbers parsed back out of the summary filenames below
        custom_inputs_set = set(
            tuple(map(float, map("{:.1f}".format, item)))
            for item in matrix_stacked)

        custom_inputs_completed_filenames = glob.glob(
            '/nobackup/earlacoa/machinelearning/data/summary/ds*' + output +
            '*')
        # every decimal number found in a filename identifies a completed input
        custom_inputs_completed_set = {
            tuple(float(item) for item in re.findall(r'\d+\.\d+', filename))
            for filename in custom_inputs_completed_filenames
        }

        custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
        custom_inputs = [
            np.array(item).reshape(1, -1)
            for item in custom_inputs_remaining_set
        ]
        print(f'custom inputs remaining for {output}: {len(custom_inputs)}')

        # dask bag and process
        # cap a single run at the first 5000 remaining inputs
        custom_inputs = custom_inputs[0:5000]
        print(f'predicting for {len(custom_inputs)} custom inputs ...')
        bag_custom_inputs = db.from_sequence(custom_inputs,
                                             npartitions=n_workers)
        bag_custom_inputs.map(custom_predicts).compute()

        time_end = time.time() - time_start
        print(
            f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
        )
        # When nothing was left to predict there is no per-input average; the
        # unguarded division used to raise ZeroDivisionError here.
        if custom_inputs:
            print(
                f'average time per custom input is {time_end / len(custom_inputs):0.2f} seconds'
            )
Example #29
0
    def execute(self, pipeline_context, execution_plan):
        """Submit every step of ``execution_plan`` to Dask and yield its events.

        After validating the arguments, a cluster of the kind named by
        ``self.cluster_type`` is built from ``self.build_dict(pipeline_name)``.
        One ``query_on_dask_worker`` task is submitted per step, receiving the
        futures of the steps it depends on, and every ``DagsterEvent`` found in
        the results is yielded as the futures complete.

        Raises:
            ValueError: if ``self.cluster_type`` is not one of the supported
                cluster kinds.
        """
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )
        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()
        pipeline_name = pipeline_context.pipeline_def.name
        instance = pipeline_context.instance

        # Pick the cluster class first; the backing package is imported only
        # for the kind that was actually requested.
        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster as cluster_cls
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster as cluster_cls
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster as cluster_cls
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster as cluster_cls
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster as cluster_cls
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster as cluster_cls
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster as cluster_cls
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster as cluster_cls
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster as cluster_cls
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster as cluster_cls
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        cluster = cluster_cls(**self.build_dict(pipeline_name))

        with dask.distributed.Client(cluster) as client:
            submitted = []
            future_by_step_key = {}

            for level in step_levels:
                for step in level:
                    # Sequencing is left to Dask: each task is handed the
                    # futures of the steps it depends on.
                    upstream = [
                        future_by_step_key[key]
                        for step_input in step.step_inputs
                        for key in step_input.dependency_keys
                    ]

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = (pipeline_context.pipeline.
                                  get_reconstructable_repository())

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    step_future = client.submit(
                        query_on_dask_worker,
                        upstream,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    submitted.append(step_future)
                    future_by_step_key[step.key] = step_future

            # Have Dask hand back each step's result as soon as it finishes.
            completed = dask.distributed.as_completed(submitted,
                                                      with_results=True)

            # Interrupts stay enabled while waiting on the Dask results.
            for _, step_events in iterate_with_context(
                    raise_interrupts_immediately, completed):
                for step_event in step_events:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
Example #30
0
    def _init_dask(self):
        """
        Starts a dask cluster, according to self.cluster_type.
        Sets self.client.
        Also writes useful URLs to graph-links.txt.

        If the cluster type is 'synchronous' or 'processes', no cluster is
        created; self.client is set to a DebugClient stub instead.
        (Mostly just for convenient testing and debugging.)

        Raises:
            AssertionError: for an unknown cluster type, an unimplemented
                jobqueue cluster, or a scheduler setting in the dask config
                that disagrees with the chosen debug cluster type.
            RuntimeError: if self.wait_for_workers is set and the workers do
                not launch within self.cluster_max_wait minutes.
        """

        # Consider using client.register_worker_callbacks() to configure
        # - faulthandler (later)
        # - excepthook?
        # - (okay, maybe it's just best to put that stuff in __init__.py, like in DSS)

        load_and_overwrite_dask_config(self.cluster_type, 'dask-config.yaml',
                                       True)
        self._write_driver_graph_urls()

        if self.cluster_type in JOBQUEUE_CLUSTERS:
            update_jobqueue_config_with_defaults(self.cluster_type)

            if self.cluster_type == "lsf":
                from dask_jobqueue import LSFCluster
                cluster = LSFCluster()  #ip='0.0.0.0')
            elif self.cluster_type == "sge":
                from dask_jobqueue import SGECluster
                cluster = SGECluster(ip='0.0.0.0')
            elif self.cluster_type == "slurm":
                from dask_jobqueue import SLURMCluster
                cluster = SLURMCluster(ip='0.0.0.0')
            else:
                raise AssertionError("Unimplemented jobqueue cluster")

            cluster.scale(self.num_workers)

        elif self.cluster_type == "local-cluster":
            cluster = LocalCluster(self.num_workers,
                                   threads_per_worker=1,
                                   processes=True,
                                   ip='0.0.0.0')

        elif self.cluster_type in ("synchronous", "processes"):
            cluster = None
            # synchronous/processes mode is for testing and debugging only
            assert dask.config.get('scheduler', self.cluster_type) == self.cluster_type, \
                "Inconsistency between the dask-config and the scheduler you chose."

            dask.config.set(scheduler=self.cluster_type)
            self.client = DebugClient(self.cluster_type)
        else:
            raise AssertionError("Unknown cluster type")

        dump_dask_config('full-dask-config.yaml')

        if cluster:
            dashboard = cluster.dashboard_link
            logger.info(f"Dashboard running on {dashboard}")
            # Log the link a second time with the IP swapped for this
            # machine's hostname.
            dashboard_ip = extract_ip_from_link(dashboard)
            dashboard = dashboard.replace(dashboard_ip, socket.gethostname())
            logger.info(f"              a.k.a. {dashboard}")

            # Note: Overrides config value: distributed.comm.timeouts.connect
            self.client = Client(cluster, timeout='60s')

            # Wait for the workers to spin up.
            with Timer(f"Waiting for {self.num_workers} workers to launch",
                       logger) as wait_timer:
                while (self.wait_for_workers
                       and self.client.status == "running"
                       and len(self.client.cluster.scheduler.workers) <
                       self.num_workers):

                    # cluster_max_wait is expressed in minutes
                    if wait_timer.seconds > (60 * self.cluster_max_wait):
                        # The f prefix must sit on the literal that contains
                        # the placeholder; adjacent literals are concatenated
                        # but each one is formatted on its own.
                        msg = (
                            "Not all cluster workers could be launched within the "
                            f"allotted time ({self.cluster_max_wait} minutes).\n"
                            "Try again or adjust the 'cluster-max-wait' setting.\n"
                        )
                        raise RuntimeError(msg)
                    time.sleep(0.1)

            if self.wait_for_workers and self.cluster_type == "lsf":
                self._write_worker_graph_urls('graph-links.txt')