Example #1
def main():
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory=f"2 G",
        resource_spec=f"h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    bag = db.from_sequence(nums, npartitions=number_workers)
    results = bag.map(weird_function).compute()

    print("saving ...")
    joblib.dump(results, f"/nobackup/{os.environ['USER']}/results.joblib")

    client.close()
    cluster.close()
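
The example above cleans up with explicit close() calls; since SGECluster and Client both work as context managers, the same workflow can be written so the workers and scheduler are torn down even if the computation fails. A minimal sketch under that assumption; it would be called as run(nums, weird_function) with the objects from the example, and everything else is illustrative.

import os

import dask.bag as db
import joblib
from dask.distributed import Client
from dask_jobqueue import SGECluster


def run(nums, func, number_jobs=35):
    # context managers shut the workers and scheduler down even if the map fails
    with SGECluster(cores=1, processes=1, memory="2 GB",
                    walltime="04:00:00") as cluster, Client(cluster):
        # an active Client makes compute() run on the SGE workers
        cluster.scale(jobs=number_jobs)
        results = db.from_sequence(nums, npartitions=number_jobs).map(func).compute()

    # read the user name from the environment rather than hard-coding it
    joblib.dump(results, f"/nobackup/{os.environ['USER']}/results.joblib")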
Example #2
def init_cluster(name, args):
    resource_spec = "h_vmem={}G,mem_req={}G".format(args.h_vmem, args.mem_req)
    exclude_nodes = "&".join(["!" + x for x in args.exclude_nodes])
    if len(exclude_nodes) > 0:
        exclude_nodes = "#$ -l h=" + exclude_nodes
    env_extra = [
        "#$ -e {}".format(args.log_dir or "/dev/null"),
        "#$ -o {}".format(args.log_dir or "/dev/null"),
        "#$ -pe serial {}".format(args.ngpus if args.ngpus > 0 else args.ncpus),
        exclude_nodes,
        "source " + args.to_source if args.to_source is not None else "",
        "export LANG=en_US.UTF-8",
        "export LC_ALL=en_US.UTF-8",
        "export MKL_NUM_THREADS=1",
        "export NUMEXPR_NUM_THREADS=1",
        "export OMP_NUM_THREADS=1",
        "export DISABLE_MP_CACHE=1",
        "export TORCH_HOME=/sequoia/data1/rriochet/.torch",
    ]
    for var in args.export_var:
        env_extra.append(f'export {var}="{os.environ[var]}"')
    cluster = SGECluster(
        queue=args.queue,
        resource_spec=resource_spec,
        walltime="720:00:00",
        name=name,
        cores=args.ncpus,
        memory="{}G".format(args.mem_req),
        processes=1,
        interface="ib0",
        local_directory=args.log_dir,
        env_extra=env_extra,
        spill_dir=args.spill_dir,
        extra=["--no-nanny"],
    )
    # cluster.adapt(maximum_jobs=args.jobs)
    cluster.scale(args.jobs)
    return cluster
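
The keyword names used here (env_extra and extra, plus job_extra in other examples on this page) match older dask-jobqueue releases; in dask-jobqueue 0.8 and later they were renamed to job_script_prologue, worker_extra_args, and job_extra_directives. A hedged sketch of the equivalent construction under the newer names, with placeholder values rather than the arguments above:

from dask_jobqueue import SGECluster

# sketch assuming dask-jobqueue >= 0.8, where the *_extra keywords were renamed
cluster = SGECluster(
    queue="all.q",                                      # placeholder queue name
    cores=4,
    processes=1,
    memory="8GB",
    walltime="720:00:00",
    job_script_prologue=["export OMP_NUM_THREADS=1"],   # formerly env_extra
    job_extra_directives=["-pe serial 4"],              # formerly job_extra
    worker_extra_args=["--no-nanny"],                   # formerly extra
)
cluster.scale(jobs=2)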
Example #3
def test_complex_cancel_command(loop):
    with SGECluster(walltime="00:02:00",
                    cores=1,
                    processes=1,
                    memory="2GB",
                    loop=loop) as cluster:
        with Client(cluster) as client:
            username = "******"
            cluster.cancel_command = "qdel -u {}".format(username)

            cluster.scale(2)

            start = time()
            while not client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT

            cluster.scale(0)

            start = time()
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < start + QUEUE_WAIT
Example #4
def main():
    # dask cluster and client
    number_processes = 1
    number_jobs = 35
    number_workers = number_processes * number_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="04:00:00",
        memory=f"12 G",
        resource_spec=f"h_vmem=12G",
        scheduler_options={
            "dashboard_address": ":2727",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {number_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)
    cluster.scale(jobs=number_jobs)

    # main processing
    print("processing ...")
    bag = db.from_sequence(sims, npartitions=number_workers)
    results = bag.map(create_ozone_metric).compute()
    print("complete")

    client.close()
    cluster.close()
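
Examples #1 and #4 differ only in the memory request, the input sequence, and the mapped function; if they live in the same project, the shared SGECluster settings can be factored into a helper. An illustrative sketch (the helper name and defaults are not from the examples, and it keeps the older keyword names the examples use):

import os

from dask_jobqueue import SGECluster


def make_sge_cluster(memory="12 G", h_vmem="12G", walltime="04:00:00",
                     processes=1, dashboard=":2727"):
    # illustrative helper wrapping the settings shared by the two scripts above
    return SGECluster(
        interface="ib0",
        cores=processes,
        processes=processes,
        walltime=walltime,
        memory=memory,
        resource_spec=f"h_vmem={h_vmem}",
        scheduler_options={"dashboard_address": dashboard},
        job_extra=["-cwd", "-V", f"-pe smp {processes}", "-l disk=1G"],
        local_directory=os.path.join(os.environ.get("PWD", os.getcwd()),
                                     "dask-worker-space"),
    )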
Example #5
def run_JK_distributed_massboosted(df, param):
    '''Receives the pandas dataframe with the objects containing the
    temperature decrements and the parameter object, runs the kSZ
    statistic, and generates jackknives.
    Everything runs on the cluster, so the current terminal does not need
    to request many CPUs.

    df: dataframe containing the variables for the calculation
    param: param file for this calculation
    NJK: how many subgroups to make for the calculation'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    #setup cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores, 'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores
            #                                    'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    #end setting up cluster

    #send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(get_pairwise_ksz_massboosted,
                                    future_fullDataset,
                                    future_params,
                                    multithreading=True)
    #done with the full dataset
    jk_results = []
    futureData = []  #data to be sent in jk or bootstrap in galaxy space

    for j in range(Ngroups):
        df_bs = df.copy()
        choose = np.random.choice(len(df), len(df))
        df_bs['dT'] = df.dT.values[choose]
        futureData.append(client.scatter(df_bs))

    resampling_method = param.JK_RESAMPLING_METHOD.lower()
    if resampling_method == "bs_dt_mass_boosted_est":
        get_pw_func = get_pairwise_ksz_massboosted
    elif resampling_method == "bs_dt_mass_boosted_est_debiased":
        get_pw_func = get_pairwise_ksz_massboosted_debiased
    else:
        raise ValueError(
            f"unsupported JK_RESAMPLING_METHOD: {param.JK_RESAMPLING_METHOD}")

    for j in range(Ngroups):
        jk_results.append(
            client.submit(get_pw_func,
                          futureData[j],
                          future_params,
                          multithreading=True))


    # extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
    #  cluster.close()

    return fullDataset_results, jk_results
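
The function above follows a scatter/submit/gather pattern: the shared dataframe and parameters are scattered to the cluster once, each task is submitted against the resulting futures, and the results are gathered at the end, so the large inputs are not re-serialised for every task. A stripped-down sketch of that pattern; compute_statistic stands in for the kSZ estimators used here:

import numpy as np


def bootstrap_distributed(client, df, param, n_groups, compute_statistic):
    # scatter the shared inputs once; every task references the same futures
    df_future = client.scatter(df)
    param_future = client.scatter(param)
    full = client.submit(compute_statistic, df_future, param_future)

    resampled = []
    for _ in range(n_groups):
        df_bs = df.copy()
        df_bs["dT"] = df.dT.values[np.random.choice(len(df), len(df))]
        resampled.append(
            client.submit(compute_statistic, client.scatter(df_bs), param_future))

    # block for the full-dataset result, then gather the bootstrap realisations
    return full.result(), client.gather(resampled)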
Example #6
#client = Client(cluster)  # start local workers as threads

#import ipdb; ipdb.set_trace()

### SGE

#"""
from dask_jobqueue import PBSCluster, SGECluster

q1d_resource_spec = "q_1day=TRUE,io_big=TRUE"
gpu_resource_spec = "q_gpu=TRUE"

cluster = SGECluster(
    queue="q_1day",
    memory='4GB',
    cores=2,
    log_directory="/idiap/user/tpereira/github/snakemaking/dask/bob_bio/logs",
    local_directory="/idiap/user/tpereira/github/snakemaking/dask/bob_bio/logs",
    resource_spec=q1d_resource_spec)

cluster.scale_up(NODES)
client = Client(cluster)  # start local workers as threads
#"""

#############
# HERE I'M TESTING HETEROGENEOUS JOBS
#

### NODES q1d
#for i in range(NODES-1):
# GETTING THE SPEC TEMPLATE#
Example #7
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"64 G",
        resource_spec=f"h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-scale-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # scale custom outputs
    if normal:
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
            )).T.reshape(-1, 5)
        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
            )

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]])
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613,
             0.724])  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))

        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}'
            )

    emission_configs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted_scaled/ds*{output}_popgrid_0.25deg_adjusted_scaled.nc"
    )
    emission_configs_completed = [
        f"{item[88:-45]}" for item in emission_configs_completed
    ]

    emission_configs_20percentintervals_remaining_set = set(
        emission_configs_20percentintervals) - set(emission_configs_completed)
    emission_configs_remaining = [
        item for item in emission_configs_20percentintervals_remaining_set
    ]
    print(
        f"custom outputs remaining for {output}: {len(emission_configs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining"
    )

    # dask bag and process
    emission_configs_remaining = emission_configs_remaining[:35000]
    print(
        f"predicting for {len(emission_configs_remaining)} custom outputs ...")
    bag_emission_configs = db.from_sequence(emission_configs_remaining,
                                            npartitions=n_workers)
    bag_emission_configs.map(scale).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(emission_configs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
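
compute() above blocks with no feedback until every partition is done; for runs that take hours it can help to persist the bag and watch a progress bar. A short sketch of a small helper wrapping that pattern with dask.distributed.progress; it would be called with emission_configs_remaining, scale, and n_workers from the example:

import dask.bag as db
from dask.distributed import progress


def map_with_progress(items, func, npartitions):
    # start the tasks on the cluster, show a text progress bar, then collect
    persisted = db.from_sequence(items, npartitions=npartitions).map(func).persist()
    progress(persisted)
    return persisted.compute()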
Example #8
def run_JK_distributed(df, param, randomize=True):
    '''Receives the pandas dataframe with the objects containing the
    temperature decrements and the parameter object, runs the kSZ
    statistic, and generates jackknives.
    Everything runs on the cluster, so the current terminal does not need
    to request many CPUs.

    df: dataframe containing the variables for the calculation
    param: param file for this calculation
    NJK: how many subgroups to make for the calculation
    randomize: shuffle the data before running the JK'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS
    resampling_method = param.JK_RESAMPLING_METHOD.lower()

    #setup cluster
    cluster = SGECluster(walltime='172800', processes=1, cores=1,
                         env_extra=['#$-pe sge_pe %i' % Ncores,
                                    '-l m_core=%i' % Ncores,
                                    'mkdir -p /tmp/pag227/dask/dask-scratch',
                                    'export NUMBA_NUM_THREADS=%i' % Ncores,
                                    'export OMP_NUM_THREADS=%i' % Ncores
#                                    'export OMP_NUM_THREADS=1',  # noqa
                                    ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    #end setting up cluster

    #send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(pairwiser.get_pairwise_ksz,
                                    future_fullDataset,
                                    future_params, multithreading=True)
    #done with the full dataset

    #iterate over partial dataset for the JK
    if JK == resampling_method:
        indices_toDrop = JK_tools.indicesToDrop(df, Ngroups,
                                                randomize=randomize)
    jk_results = []
    futureData = []  #data to be sent in jk or bootstrap in galaxy space

    if (JK == resampling_method) or (BS == resampling_method):
        for j in range(Ngroups):  # submit data to the cluster
            if JK in resampling_method:  # if method jk
                dataJK = df.drop(indices_toDrop[j], inplace=False)
                futureData.append(client.scatter(dataJK))
            elif BS in resampling_method:
                dataBS = df.sample(len(df), replace=True)
                futureData.append(client.scatter(dataBS))
        #Now do the JK calculation
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                              futureData[j],
                              future_params, multithreading=True))

    if BS_PW == resampling_method:  # submit the same dataset
        futureData = client.scatter(df, broadcast=True)

        for j in range(Ngroups):
            jk_results.append(client.submit(bs_pw.get_bootstrap_pairwise,
                                            futureData,
                                            future_params,
                                            multithreading=True,
                                            pure=False))
    if resampling_method == BS_DT:
        for j in range(Ngroups):
            df_bs = df.copy()
            choose = np.random.choice(len(df), len(df))
            df_bs['dT'] = df.dT.values[choose]
            futureData.append(client.scatter(df_bs))
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))

    if resampling_method == TL_JK:
        tiled_JK.classify_grid(df)
        df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5)
        Ntiles = tiled_JK.how_many_tiles(df)
        for j in range(Ntiles):
            df_tosubmit = tiled_JK.remove_tile(df, j)
            futureData.append(client.scatter(df_tosubmit))
        for j in range(Ntiles):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))
    #extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
#    cluster.close()

    return fullDataset_results, jk_results

Example #9
if __name__ == '__main__':

    data = "./clustering_Biden/"

    N = 10000000

    #n_workers = int(os.getenv('NSLOTS'))
    #print("n_workers: ", n_workers)
    #from dask.distributed import LocalCluster
    #cluster = LocalCluster()

    from dask_jobqueue import SGECluster
    cluster = SGECluster(cores=28,
                         memory='200GB',
                         job_extra=['-P scv', '-pe mpi_28_tasks_per_node 112'])
    #cluster.scale(4)

    from dask.distributed import Client, progress
    #client = Client("tcp://192.168.19.195:8786")
    client = Client(cluster)
    print(client)

    entries = os.listdir(data)

    #from multiprocessing import Process, Lock, Manager
    # About 1 min/file, may need parallelization later.
    if (not path.exists("./dict_A.pickle")):
        t1 = time.time()
        # A dictionary that holds all records in A
Example #10
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    future = client.submit(
                        query_on_dask_worker,
                        dependencies,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their
            # results back to the master
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_interrupts_immediately, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
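
The if/elif ladder that maps cluster_type to a cluster class works but is easy to mistype; one alternative is a lookup table of module and class names resolved lazily with importlib. A hedged sketch of that approach (this is not how the executor above is implemented):

import importlib

# map cluster_type to (module, class name); importing lazily keeps optional deps optional
_CLUSTER_TYPES = {
    "local": ("dask.distributed", "LocalCluster"),
    "yarn": ("dask_yarn", "YarnCluster"),
    "ssh": ("dask.distributed", "SSHCluster"),
    "pbs": ("dask_jobqueue", "PBSCluster"),
    "moab": ("dask_jobqueue", "MoabCluster"),
    "sge": ("dask_jobqueue", "SGECluster"),
    "lsf": ("dask_jobqueue", "LSFCluster"),
    "slurm": ("dask_jobqueue", "SLURMCluster"),
    "oar": ("dask_jobqueue", "OARCluster"),
    "kube": ("dask_kubernetes", "KubeCluster"),
}


def make_cluster(cluster_type, **kwargs):
    try:
        module_name, class_name = _CLUSTER_TYPES[cluster_type]
    except KeyError:
        raise ValueError(f"Unknown cluster type {cluster_type!r}; "
                         f"expected one of {sorted(_CLUSTER_TYPES)}")
    cluster_cls = getattr(importlib.import_module(module_name), class_name)
    return cluster_cls(**kwargs)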
Example #11
def run_error_estimation_distributed(df1, df2, param):
    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    #setup cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores, 'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores
            #                                    'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(10)
    #end setting up cluster

    #send full dataset to the cluster
    future_df1 = client.scatter(df1)
    future_df2 = client.scatter(df2)

    future_params = client.scatter(param)
    res_fullDataset_11 = client.submit(cpw.get_cross_pairwise_ksz, future_df1,
                                       future_df1, future_params)
    res_fullDataset_12 = client.submit(cpw.get_cross_pairwise_ksz, future_df1,
                                       future_df2, future_params)
    res_fullDataset_22 = client.submit(cpw.get_cross_pairwise_ksz, future_df2,
                                       future_df2, future_params)
    #done with the full dataset

    #iterate over partial dataset for the JK
    replicants1 = []  #data to be sent
    replicants2 = []

    if 'jk' in param.JK_RESAMPLING_METHOD.lower():
        all_indx = np.arange(len(df1))
        np.random.shuffle(all_indx)
        indx_to_drop = np.array_split(all_indx, param.JK_NGROUPS)
    for j in range(Ngroups):  # submit data to the cluster
        if 'jk' in param.JK_RESAMPLING_METHOD.lower():  # if method jk
            todrop = indx_to_drop[j]
            replicant1 = df1.drop(df1.index[todrop], inplace=False)
            replicant2 = df2.drop(df2.index[todrop], inplace=False)

            replicants1.append(client.scatter(replicant1))
            replicants2.append(client.scatter(replicant2))
        elif 'bootstrap' in param.JK_RESAMPLING_METHOD.lower():
            indxs = np.random.randint(low=0, high=len(df1), size=len(df1))
            replicant1 = df1.iloc[indxs]
            replicant2 = df2.iloc[indxs]
            replicants1.append(client.scatter(replicant1))
            replicants2.append(client.scatter(replicant2))

    #Now do the JK calculation
    realizations11 = []
    realizations12 = []
    realizations22 = []

    for j in range(Ngroups):
        realizations11.append(
            client.submit(cpw.get_cross_pairwise_ksz, replicants1[j],
                          replicants1[j], future_params))
        realizations12.append(
            client.submit(cpw.get_cross_pairwise_ksz, replicants1[j],
                          replicants2[j], future_params))
        realizations22.append(
            client.submit(cpw.get_cross_pairwise_ksz, replicants2[j],
                          replicants2[j], future_params))
    #extract results
    fullDataset_result11 = res_fullDataset_11.result()
    fullDataset_result12 = res_fullDataset_12.result()
    fullDataset_result22 = res_fullDataset_22.result()

    resampling_result11 = client.gather(realizations11)
    resampling_result12 = client.gather(realizations12)
    resampling_result22 = client.gather(realizations22)
    client.close()
    #    cluster.close()

    results = {
        'full11': fullDataset_result11,
        'full12': fullDataset_result12,
        'full22': fullDataset_result22,
        'resampled11': resampling_result11,
        'resampled12': resampling_result12,
        'resampled22': resampling_result22
    }

    return results
Example #12
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            'pipeline_context',
            'Expected executor to be DaskExecutor got {}'.format(
                pipeline_context.executor),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.run_config.get('storage')
        check.invariant(storage.keys(),
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == 'local':
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'yarn':
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'ssh':
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'pbs':
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'moab':
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'oar':
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'kube':
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={'in_process': {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )
                    variables = {
                        'executionParams': {
                            'selector': {
                                'pipelineName': pipeline_name,
                                'repositoryName':
                                recon_repo.get_definition().name,
                                'repositoryLocationName': '<<in_process>>',
                            },
                            'runConfigData': run_config,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    workspace = create_in_process_ephemeral_workspace(
                        pointer=pipeline_context.pipeline.
                        get_reconstructable_repository().pointer)

                    future = client.submit(
                        query_on_dask_worker,
                        workspace,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their
            # results back to the master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
def run_dask(options: dict,
        docker_username: str = None,
        docker_password: str = None,
        docker: bool = False,
        slurm_job_array: bool = False):
    try:
        if 'jobqueue' not in options:
            cluster = LocalCluster()
        else:
            jobqueue = options['jobqueue']
            gpus = options['gpus'] if 'gpus' in options else 0
            if 'slurm' in jobqueue:
                print("Requesting SLURM cluster:")
                pprint(jobqueue['slurm'])
                cluster = SLURMCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['slurm']) if gpus else SLURMCluster(**jobqueue['slurm'])
            elif 'pbs' in jobqueue:
                print("Requesting PBS cluster:")
                pprint(jobqueue['pbs'])
                cluster = PBSCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['pbs']) if gpus else PBSCluster(**jobqueue['pbs'])
            elif 'moab' in jobqueue:
                print("Requesting MOAB cluster:")
                pprint(jobqueue['moab'])
                cluster = MoabCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['moab']) if gpus else MoabCluster(**jobqueue['moab'])
            elif 'sge' in jobqueue:
                print("Requesting SGE cluster:")
                pprint(jobqueue['sge'])
                cluster = SGECluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['sge']) if gpus else SGECluster(**jobqueue['sge'])
            elif 'lsf' in jobqueue:
                print("Requesting LSF cluster:")
                pprint(jobqueue['lsf'])
                cluster = LSFCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['lsf']) if gpus else LSFCluster(**jobqueue['lsf'])
            elif 'oar' in jobqueue:
                print("Requesting OAR cluster:")
                pprint(jobqueue['oar'])
                cluster = OARCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['oar']) if gpus else OARCluster(**jobqueue['oar'])
            else:
                raise ValueError(f"Unsupported jobqueue configuration: {jobqueue}")

            print(f"Cluster job script: {cluster.job_script()}")

        if 'output' in options and 'from' in options['output']: output_path = options['output']['from']
        else: output_path = '.'

        if 'input' not in options:
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options: cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed: {future.exception}")
                else:
                    logger.info(f"Container completed")
        elif options['input']['kind'] == InputKind.DIRECTORY:
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options: cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path}, {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for directory '{input_path}'")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for directory '{input_path}': {future.exception}")
                else:
                    logger.info(f"Container completed for directory '{input_path}'")
        elif options['input']['kind'] == InputKind.FILES:
            input_path = options['input']['path']
            if slurm_job_array:
                files = os.listdir(input_path)
                file_id = int(os.environ.get('SLURM_ARRAY_TASK_ID'))
                current_file = files[file_id]

                env = options['env'] if 'env' in options else []
                params = options['parameters'] if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                if 'jobqueue' in options: cluster.scale(1)
                with Client(cluster) as client:
                    command = prep_command(
                        work_dir=options['workdir'],
                        image=options['image'],
                        command=options['command'],
                        env=env + [{'key': 'INDEX', 'value': file_id}] + [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                        parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)}, {'key': 'OUTPUT', 'value': output_path}],
                        bind_mounts=bind_mounts,
                        no_cache=no_cache,
                        gpus=gpus,
                        docker_username=docker_username,
                        docker_password=docker_password,
                        docker=docker)

                    logger.info(f"Submitting container for file '{input_path}'")
                    future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                    future.result()
                    if future.status != 'finished':
                        logger.error(f"Container failed for file '{input_path}': {future.exception}")
                    else:
                        logger.info(f"Container completed for file '{input_path}'")

                logger.info(f"Run succeeded")
            else:
                files = os.listdir(input_path)
                count = len(files)
                futures = []

                if 'jobqueue' not in options:
                    logger.info(f"Processing {count} files in '{input_path}'")
                else:
                    logger.info(f"Requesting {count} nodes to process {count} files in '{input_path}' with job script:\n{cluster.job_script()}")
                    cluster.scale(count)

                env = options['env'] if 'env' in options else []
                params = deepcopy(options['parameters']) if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                with Client(cluster) as client:
                    num_files = len(files)
                    for i, current_file in tqdm.tqdm(enumerate(files), total=num_files):
                        command = prep_command(
                            work_dir=options['workdir'],
                            image=options['image'],
                            command=options['command'],
                            env=env + [{'key': 'INDEX', 'value': i}] + [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                            parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)}, {'key': 'OUTPUT', 'value': output_path}],
                            bind_mounts=bind_mounts,
                            no_cache=no_cache,
                            gpus=gpus,
                            docker_username=docker_username,
                            docker_password=docker_password,
                            docker=docker)

                        logger.info(f"Submitting container for file {i}")
                        futures.append(submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3))

                    finished = 0
                    for future in tqdm.tqdm(as_completed(futures), total=num_files):
                        finished += 1
                        if future.status != 'finished':
                            logger.error(f"Container failed for file {finished}: {future.exception}")
                        else:
                            logger.info(f"Container completed for file {finished}")
        elif options['input']['kind'] == InputKind.FILE:
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options: cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}] + [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path}, {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for file 1")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for file 1")
                    logger.error(future.exception)
                else:
                    logger.info(f"Container completed for file 1")

        logger.info(f"Run succeeded")
    except:
        logger.error(f"Run failed: {traceback.format_exc()}")
        raise
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"2 G",
        resource_spec=f"h_vmem=2G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=1G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # custom inputs
    if normal:
        matrix_stacked = np.array(
            np.meshgrid(
                np.linspace(
                    0, 1.5, 16
                ),  # 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
                np.linspace(0, 1.5, 16),
            )).T.reshape(-1, 5)
        custom_inputs_set = set(
            tuple(map(float, map("{:.1f}".format, item)))
            for item in matrix_stacked)

        custom_inputs_completed_filenames = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}*"
        )
        custom_inputs_completed_list = []
        for custom_inputs_completed_filename in custom_inputs_completed_filenames:
            custom_inputs_completed_list.append([
                float(item) for item in re.findall(
                    r"\d+\.\d+", custom_inputs_completed_filename)
            ])

        custom_inputs_completed_set = set(
            tuple(item) for item in custom_inputs_completed_list)
        custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
        custom_inputs = [
            np.array(item).reshape(1, -1)
            for item in custom_inputs_remaining_set
        ]
        print(f"custom inputs remaining for {output}: {len(custom_inputs)}")

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]])
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

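            # *_res/_ind/... keep only that sector's scaling (others reset to 1.0),
            # *_no<sector> zero out a single sector, and *_<sector>only zero out
            # every sector except that one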
            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        # only needed for emulator_predictions.py, which requires these sector-only inputs to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

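        # encode each input as an emission-config string, deduplicate via a set,
        # then convert the unique configs back into arrays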
        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

        emission_configs_20percentintervals = list(
            set(emission_configs_20percentintervals))

        custom_inputs = []
        for emission_config in emission_configs_20percentintervals:
            custom_input = np.array([
                float(num) for num in re.findall(r'\d\.\d+', emission_config)
            ]).reshape(1, -1)
            custom_inputs.append(custom_input)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        # only needed for emulator_predictions.py, which requires these sector-only inputs to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

        emission_configs_20percentintervals = list(
            set(emission_configs_20percentintervals))

        custom_inputs = []
        for emission_config in emission_configs_20percentintervals:
            custom_input = np.array([
                float(num) for num in re.findall(r'\d\.\d+', emission_config)
            ]).reshape(1, -1)
            custom_inputs.append(custom_input)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613,
             0.724])  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        custom_inputs = [
            np.array(item).reshape(1, -1) for item in emission_configs
        ]
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        custom_inputs.append(np.array([[0.242, 0.160, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.181, 0.120, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.121, 0.080, 0.659, 0.613, 0.724]]))
        custom_inputs.append(np.array([[0.060, 0.040, 0.659, 0.613, 0.724]]))

        # only needed for emulator_predictions.py, which requires these sector-only inputs to adjust for double emissions
        custom_inputs_temp = custom_inputs.copy()
        for custom_input in custom_inputs_temp:
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)
            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

        emission_configs_20percentintervals = set(
            emission_configs_20percentintervals)

        custom_inputs_completed_filenames = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}.nc"
        )
        custom_inputs_completed_list = []
        for custom_inputs_completed_filename in custom_inputs_completed_filenames:
            emission_config = re.findall(
                r"RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+",
                custom_inputs_completed_filename)
            if len(emission_config) > 0:
                custom_inputs_completed_list.append(emission_config)

        custom_inputs_completed_set = set(
            item[0] for item in custom_inputs_completed_list)
        custom_inputs_remaining_set = emission_configs_20percentintervals - custom_inputs_completed_set
        custom_inputs = [
            np.array([float(n)
                      for n in re.findall(r'\d+\.\d+', item)]).reshape(1, -1)
            for item in custom_inputs_remaining_set
        ]

    # dask bag and process
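    # cap each submission at 5,000 inputs; the commented slice below selects the
    # next batch on a subsequent run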
    custom_inputs = custom_inputs[:5000]
    #custom_inputs = custom_inputs[5000:]

    print(f"predicting for {len(custom_inputs)} custom inputs ...")
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom input is {time_end / len(custom_inputs):0.2f} seconds"
    )

    client.close()
    cluster.close()
Example #15
0
def main():
    # dask cluster and client
    if output == 'PM2_5_DRY':
        n_jobs = 20
        n_outputs = 1000
    elif output == 'o3_6mDM8h':
        n_jobs = 20
        n_outputs = 2000

    n_processes = 1
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="02:00:00",
        memory=f"48 G",
        resource_spec=f"h_vmem=48G",
        scheduler_options={
            "dashboard_address": ":7777",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=48G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-hia-ozone-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # find remaining inputs
    if normal:
        custom_outputs = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted_scaled/ds*{output}_popgrid_0.25deg_adjusted_scaled.nc"
        )
        custom_outputs_completed = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/health_impact_assessments/{output}_adjusted_scaled/df_country_hia_*.csv"
        )
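        # the slice arithmetic below trims the filename prefix/suffix from each path
        # so that both sets reduce to comparable emission-config identifiers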
        custom_outputs_remaining_set = set([
            item.split("/")[-1][3:-1 - len(output) - 19 - 7]
            for item in custom_outputs
        ]) - set([
            item.split("/")[-1][15 + len(output) + 1:-4 - 7]
            for item in custom_outputs_completed
        ])
        custom_outputs_remaining = [
            item for item in custom_outputs_remaining_set
        ]
        print(
            f"custom outputs remaining for {output}: {len(custom_outputs_remaining)} - 10% intervals with {int(100 * len(custom_outputs_remaining_set) / 16**5)}% remaining"
        )

        reduce_to_20percent_intervals = True
        if reduce_to_20percent_intervals:
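            # 8 steps from 0.0 to 1.4 give a 0.2 (20%) spacing per sector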
            emission_configs = np.array(
                np.meshgrid(
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                    np.linspace(0.0, 1.4, 8),
                )).T.reshape(-1, 5)
            emission_configs_20percentintervals = []
            for emission_config in emission_configs:
                emission_configs_20percentintervals.append(
                    f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
                )

            emission_configs_completed = []
            for custom_output_completed in custom_outputs_completed:
                emission_configs_completed.append(
                    re.findall(
                        r'RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+',
                        custom_output_completed)[0])

            emission_configs_20percentintervals_remaining_set = set(
                emission_configs_20percentintervals) - set(
                    emission_configs_completed)
            custom_outputs_remaining = [
                item
                for item in emission_configs_20percentintervals_remaining_set
            ]
            print(
                f"custom outputs remaining for {output}: {len(custom_outputs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining"
            )

    if extra:
        if year == '2010':
            custom_inputs_main = [
                np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            ]
        elif year == '2011':
            custom_inputs_main = [
                np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            ]
        elif year == '2012':
            custom_inputs_main = [
                np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            ]
        elif year == '2013':
            custom_inputs_main = [
                np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            ]
        elif year == '2014':
            custom_inputs_main = [
                np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            ]
        elif year == '2015':
            custom_inputs_main = [
                np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]),  # control
            ]
        elif year == '2016':
            custom_inputs_main = [
                np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
                np.array([[0.76, 0.934, 0.735, 0.683,
                           0.708]]),  # top-down 2016 - both
                np.array([[0.744, 0.904, 0.778, 0.678,
                           0.716]]),  # top-down 2016 - either
                np.array([[0.803, 0.835, 0.742, 0.71,
                           0.717]]),  # top-down 2016 - pm25 only
                np.array([[0.769, 1.009, 0.697, 0.69,
                           0.72]]),  # top-down 2016 - o3 only
            ]
        elif year == '2017':
            custom_inputs_main = [
                np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
                np.array([[0.704, 0.786, 0.73, 0.659,
                           0.6]]),  # top-down 2017 - both
                np.array([[0.771, 0.835, 0.711, 0.685,
                           0.544]]),  # top-down 2017 - either
                np.array([[0.721, 0.863, 0.712, 0.74,
                           0.709]]),  # top-down 2017 - pm25 only
                np.array([[0.824, 0.759, 0.767, 0.641,
                           0.429]]),  # top-down 2017 - o3 only
            ]
        elif year == '2018':
            custom_inputs_main = [
                np.array([[0.712, 0.703, 0.725, 0.676,
                           0.649]]),  # top-down 2018 - both
                np.array([[0.647, 0.945, 0.746, 0.588,
                           0.473]]),  # top-down 2018 - either
                np.array([[0.661, 0.674, 0.694, 0.742,
                           0.715]]),  # top-down 2018 - pm25 only
                np.array([[0.858, 1.092, 0.794, 0.604,
                           0.475]]),  # top-down 2018 - o3 only
            ]
        elif year == '2019':
            custom_inputs_main = [
                np.array([[0.739, 0.668, 0.701, 0.686,
                           0.682]]),  # top-down 2019 - both
                np.array([[0.657, 0.745, 0.714, 0.613,
                           0.591]]),  # top-down 2019 - either
                np.array([[0.701, 0.642, 0.669, 0.681,
                           0.679]]),  # top-down 2019 - pm25 only
                np.array([[0.8, 0.987, 0.648, 0.57,
                           0.493]]),  # top-down 2019 - o3 only
            ]
        elif year == '2020':
            custom_inputs_main = [
                np.array([[0.67, 0.609, 0.709, 0.621,
                           0.661]]),  # top-down 2020 - both
                np.array([[0.582, 0.7, 0.672, 0.5,
                           0.492]]),  # top-down 2020 - either
                np.array([[0.604, 0.399, 0.659, 0.613,
                           0.724]]),  # top-down 2020 - pm25 only
                np.array([[0.867, 0.957, 0.677, 0.558,
                           0.477]]),  # top-down 2020 - o3 only
            ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)

        custom_outputs_remaining = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            custom_outputs_remaining.append(emission_config)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        custom_outputs_remaining = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            custom_outputs_remaining.append(emission_config)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613,
             0.724])  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([[0.242, 0.160, 0.659, 0.613,
                                           0.724]]))
        emission_configs.append(np.array([[0.181, 0.120, 0.659, 0.613,
                                           0.724]]))
        emission_configs.append(np.array([[0.121, 0.080, 0.659, 0.613,
                                           0.724]]))
        emission_configs.append(np.array([[0.060, 0.040, 0.659, 0.613,
                                           0.724]]))

        emission_configs_total = []
        for emission_config in emission_configs:
            emission_configs_total.append(
                f'RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}'
            )

        custom_outputs_completed = glob.glob(
            f"/nobackup/earlacoa/machinelearning/data_annual/health_impact_assessments/{output}_adjusted_scaled/df_country_hia_*.csv"
        )
        emission_configs_completed = []
        for custom_output_completed in custom_outputs_completed:
            emission_configs_completed.append(
                re.findall(
                    r'RES\d+\.\d+_IND\d+\.\d+_TRA\d+\.\d+_AGR\d+\.\d+_ENE\d+\.\d+',
                    custom_output_completed)[0])

        emission_configs_remaining_set = set(emission_configs_total) - set(
            emission_configs_completed)
        custom_outputs_remaining = [
            item for item in emission_configs_remaining_set
        ]
        print(
            f"custom outputs remaining: {len(custom_outputs_remaining)}, {int(100 * len(emission_configs_remaining_set) / len(emission_configs_total))}%"
        )

    # --------------------------------------------------

    # dask bag and process
    # run in 10 chunks over 10 cores, each chunk taking 2 minutes
    custom_outputs_remaining = custom_outputs_remaining[0:n_outputs]
    print(f"predicting for {len(custom_outputs_remaining)} custom outputs ...")
    bag_custom_outputs = db.from_sequence(custom_outputs_remaining,
                                          npartitions=n_workers)
    if output == "PM2_5_DRY":
        bag_custom_outputs.map(health_impact_assessment_pm25).compute()
    elif output == "o3_6mDM8h":
        bag_custom_outputs.map(health_impact_assessment_o3).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
Example #16
0
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"64 G",
        resource_spec=f"h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join([os.environ.get("PWD"), "dask-worker-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # regrid custom outputs to pop grid
    custom_outputs = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}.nc"
    )
    custom_outputs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}/ds*{output}_popgrid_0.25deg.nc"
    )
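    # drop the trailing '_popgrid_0.25deg.nc' (19 characters) and re-append '.nc'
    # so completed outputs can be matched against the original prediction files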
    custom_outputs_completed = [
        f"{item[0:-19]}.nc" for item in custom_outputs_completed
    ]
    custom_outputs_remaining_set = set(custom_outputs) - set(custom_outputs_completed)
    custom_outputs_remaining = [item for item in custom_outputs_remaining_set]
    print(f"custom outputs remaining for {output}: {len(custom_outputs_remaining)}")

    # dask bag and process
    custom_outputs_remaining = custom_outputs_remaining[
        0:5000
    ]  # run in 5,000 chunks over 30 cores, each chunk taking 2 minutes
    print(f"predicting for {len(custom_outputs_remaining)} custom outputs ...")
    bag_custom_outputs = db.from_sequence(
        custom_outputs_remaining, npartitions=n_workers
    )
    bag_custom_outputs.map(regrid_to_pop).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(custom_outputs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
Example #17
0
    def _init_dask(self):
        """
        Starts a dask cluster, according to the cluster type specified in the constructor.
        Sets self.client.
        Also writes useful URLs to graph-links.txt.

        If the 'cluster-type' is 'synchronous', then the cluster will be
        a special stub class (DebugCluster), which provides dummy
        implementations of a few functions from the DistributedCluster API.
        (Mostly just for convenient unit testing.)
        """

        # Consider using client.register_worker_callbacks() to configure
        # - faulthandler (later)
        # - excepthook?
        # - (okay, maybe it's just best to put that stuff in __init__.py, like in DSS)

        load_and_overwrite_dask_config(self.cluster_type, 'dask-config.yaml',
                                       True)
        self._write_driver_graph_urls()

        if self.cluster_type in JOBQUEUE_CLUSTERS:
            update_jobqueue_config_with_defaults(self.cluster_type)

            if self.cluster_type == "lsf":
                from dask_jobqueue import LSFCluster
                cluster = LSFCluster()  # ip='0.0.0.0'
            elif self.cluster_type == "sge":
                from dask_jobqueue import SGECluster
                cluster = SGECluster(ip='0.0.0.0')
            elif self.cluster_type == "slurm":
                from dask_jobqueue import SLURMCluster
                cluster = SLURMCluster(ip='0.0.0.0')
            else:
                raise AssertionError("Unimplemented jobqueue cluster")

            cluster.scale(self.num_workers)

        elif self.cluster_type == "local-cluster":
            cluster = LocalCluster(self.num_workers,
                                   threads_per_worker=1,
                                   processes=True,
                                   ip='0.0.0.0')

        elif self.cluster_type in ("synchronous", "processes"):
            cluster = None
            # synchronous/processes mode is for testing and debugging only
            assert dask.config.get('scheduler', self.cluster_type) == self.cluster_type, \
                "Inconsistency between the dask-config and the scheduler you chose."

            dask.config.set(scheduler=self.cluster_type)
            self.client = DebugClient(self.cluster_type)
        else:
            raise AssertionError("Unknown cluster type")

        dump_dask_config('full-dask-config.yaml')

        if cluster:
            dashboard = cluster.dashboard_link
            logger.info(f"Dashboard running on {dashboard}")
            dashboard_ip = extract_ip_from_link(dashboard)
            dashboard = dashboard.replace(dashboard_ip, socket.gethostname())
            logger.info(f"              a.k.a. {dashboard}")

            # Note: Overrides config value: distributed.comm.timeouts.connect
            self.client = Client(cluster, timeout='60s')

            # Wait for the workers to spin up.
            with Timer(f"Waiting for {self.num_workers} workers to launch",
                       logger) as wait_timer:
                while (self.wait_for_workers
                       and self.client.status == "running"
                       and len(self.client.cluster.scheduler.workers) <
                       self.num_workers):

                    if wait_timer.seconds > (60 * self.cluster_max_wait):
                        msg = (
                            f"Not all cluster workers could be launched within the "
                            "allotted time ({self.cluster_max_wait} minutes).\n"
                            "Try again or adjust the 'cluster-max-wait' setting.\n"
                        )
                        raise RuntimeError(msg)
                    time.sleep(0.1)

            if self.wait_for_workers and self.cluster_type == "lsf":
                self._write_worker_graph_urls('graph-links.txt')
Example #18
0
def main():
    # dask cluster and client
    n_processes = 1
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime=walltime,
        memory=f"32 G",
        resource_spec=f"h_vmem=32G",
        scheduler_options={
            "dashboard_address": ":5761",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-find-emis-pm-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # dask bag over emission_configs
    print(
        f"predicting over {len(emission_configs)} emission configs for {station_id} ..."
    )
    bag_emission_configs = db.from_sequence(emission_configs,
                                            npartitions=n_workers)
    results = bag_emission_configs.map(filter_emission_configs).compute()

    station_diffs_abs = [result[0] for result in results]
    station_diffs_per = [result[1] for result in results]
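    # baselines is expected to hold a single key; use it to pull each worker's
    # station diffs and keep only the non-empty ones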
    key = next(iter(baselines))
    station_diffs_abs = [
        station_diff_abs for station_diff_abs in station_diffs_abs
        if len(station_diff_abs[key]) > 0
    ]
    station_diffs_per = [
        station_diff_per for station_diff_per in station_diffs_per
        if len(station_diff_per[key]) > 0
    ]

    merged_per = {}
    for station_diff_per in station_diffs_per:
        merged_per = {**merged_per, **station_diff_per[key]}

    merged_abs = {}
    for station_diff_abs in station_diffs_abs:
        merged_abs = {**merged_abs, **station_diff_abs[key]}

    station_diffs_per = {key: merged_per}
    station_diffs_abs = {key: merged_abs}

    joblib.dump(
        obs_change_abs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_abs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        obs_change_per,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/obs_change_per_{output}_{station_id}.joblib"
    )
    joblib.dump(
        baselines,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/baselines_{output}_{station_id}.joblib"
    )
    joblib.dump(
        targets,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/targets_{output}_{station_id}.joblib"
    )
    joblib.dump(
        target_diffs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/target_diffs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        station_diffs_abs,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_abs_{output}_{station_id}.joblib"
    )
    joblib.dump(
        station_diffs_per,
        f"/nobackup/earlacoa/machinelearning/data_annual/find_emissions_that_match_change_air_quality/{sub_folder}_adjusted_scaled/station_diffs_per_{output}_{station_id}.joblib"
    )

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )

    client.close()
    cluster.close()
Example #19
0
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(interface='ib0',
                         walltime='01:00:00',
                         memory=f'2 G',
                         resource_spec=f'h_vmem=2G',
                         scheduler_options={
                             'dashboard_address': ':5757',
                         },
                         project='admiralty',
                         job_extra=[
                             '-cwd',
                             '-V',
                             f'-pe smp {n_processes}',
                             f'-l disk=1G',
                         ],
                         local_directory=os.sep.join(
                             [os.environ.get('PWD'), 'dask-worker-space']))

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # custom inputs
    matrix_stacked = np.array(
        np.meshgrid(
            np.linspace(
                0, 1.5, 16
            ),  # 1.5 and 16 for 0.1, 1.5 and 6 for 0.3, 1.4 and 8 for 0.2
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16),
            np.linspace(0, 1.5, 16))).T.reshape(-1, 5)
    custom_inputs_set = set(
        tuple(map(float, map("{:.1f}".format, item)))
        for item in matrix_stacked)

    custom_inputs_completed_filenames = glob.glob(
        '/nobackup/earlacoa/machinelearning/data/summary/ds*' + output + '*')
    custom_inputs_completed_list = []
    for custom_inputs_completed_filename in custom_inputs_completed_filenames:
        custom_inputs_completed_list.append([
            float(item) for item in re.findall(
                r'\d+\.\d+', custom_inputs_completed_filename)
        ])

    custom_inputs_completed_set = set(
        tuple(item) for item in custom_inputs_completed_list)
    custom_inputs_remaining_set = custom_inputs_set - custom_inputs_completed_set
    custom_inputs = [
        np.array(item).reshape(1, -1) for item in custom_inputs_remaining_set
    ]
    print(f'custom inputs remaining for {output}: {len(custom_inputs)}')

    # dask bag and process
    custom_inputs = custom_inputs[
        0:5000]  # run in 1,000 chunks over 30 cores, each chunk taking 1 hour
    print(f'predicting for {len(custom_inputs)} custom inputs ...')
    bag_custom_inputs = db.from_sequence(custom_inputs, npartitions=n_workers)
    bag_custom_inputs.map(custom_predicts).compute()

    time_end = time.time() - time_start
    print(
        f'completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours'
    )
    print(
        f'average time per custom input is {time_end / len(custom_inputs):0.2f} seconds'
    )

    client.close()
    cluster.close()
Example #20
0
def main():

    start_logging()

    #------------------------------------------
    # Create input and output dataset
    #------------------------------------------
    IO = IOClass()
    DATA = IO.create_data_file()

    # Create global result and restart datasets
    RESULT = IO.create_result_file()
    RESTART = IO.create_restart_file()

    #----------------------------------------------
    # Calculation - Multithreading using all cores
    #----------------------------------------------

    # Auxiliary variables for futures
    futures = []

    # Measure time
    start_time = datetime.now()

    #-----------------------------------------------
    # Create a client for distributed calculations
    #-----------------------------------------------
    if (slurm_use):

        with SLURMCluster(scheduler_port=port,
                          cores=cores,
                          processes=processes,
                          memory=memory,
                          shebang=shebang,
                          name=name,
                          job_extra=slurm_parameters,
                          local_directory='logs/dask-worker-space') as cluster:
            cluster.scale(processes * nodes)
            print(cluster.job_script())
            print("You are using SLURM!\n")
            print(cluster)
            run_cosipy(cluster, IO, DATA, RESULT, RESTART, futures)

    elif sge_use:

        with SGECluster(interface=interface,
                        cores=cores,
                        processes=processes,
                        memory=memory,
                        shebang=shebang,
                        walltime=walltime,
                        resource_spec=resource_spec,
                        job_extra=sge_parameters,
                        local_directory='logs/dask-worker-space') as cluster:
            cluster.scale(cores=total_cores)
            print(cluster.job_script())
            print("You are using SGE!\n")
            print(cluster)
            run_cosipy(cluster, IO, DATA, RESULT, RESTART, futures)

    else:
        with LocalCluster(scheduler_port=local_port,
                          n_workers=workers,
                          threads_per_worker=1,
                          silence_logs=True) as cluster:
            print(cluster)
            run_cosipy(cluster, IO, DATA, RESULT, RESTART, futures)

    print('\n')
    print('--------------------------------------------------------------')
    print('Write results ...')
    print('-------------------------------------------------------------- \n')
    start_writing = datetime.now()

    #-----------------------------------------------
    # Write results and restart files
    #-----------------------------------------------
    timestamp = pd.to_datetime(str(
        IO.get_restart().time.values)).strftime('%Y-%m-%dT%H-%M')

    encoding = dict()
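    # per-variable int16 packing parameters are computed below, but the packed
    # encoding line is commented out, so only zlib compression is applied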
    for var in IO.get_result().data_vars:
        dataMin = IO.get_result()[var].min(skipna=True).values
        dataMax = IO.get_result()[var].max(skipna=True).values
        dtype = 'int16'
        FillValue = -9999
        scale_factor, add_offset = compute_scale_and_offset(
            dataMin, dataMax, 16)
        #encoding[var] = dict(zlib=True, complevel=compression_level, dtype=dtype, scale_factor=scale_factor, add_offset=add_offset, _FillValue=FillValue)
        encoding[var] = dict(zlib=True, complevel=compression_level)

    IO.get_result().to_netcdf(os.path.join(data_path, 'output', output_netcdf),
                              encoding=encoding,
                              mode='w')

    encoding = dict()
    for var in IO.get_restart().data_vars:
        dataMin = IO.get_restart()[var].min(skipna=True).values
        dataMax = IO.get_restart()[var].max(skipna=True).values
        dtype = 'int16'
        FillValue = -9999
        scale_factor, add_offset = compute_scale_and_offset(
            dataMin, dataMax, 16)
        #encoding[var] = dict(zlib=True, complevel=compression_level, dtype=dtype, scale_factor=scale_factor, add_offset=add_offset, _FillValue=FillValue)
        encoding[var] = dict(zlib=True, complevel=compression_level)

    IO.get_restart().to_netcdf(os.path.join(data_path, 'restart',
                                            'restart_' + timestamp + '.nc'),
                               encoding=encoding)

    #-----------------------------------------------
    # Stop time measurement
    #-----------------------------------------------
    duration_run = datetime.now() - start_time
    duration_run_writing = datetime.now() - start_writing

    #-----------------------------------------------
    # Print out some information
    #-----------------------------------------------
    print(
        "\t Time required tor write restart and output files: %4g minutes %2g seconds \n"
        % (duration_run_writing.total_seconds() // 60.0,
           duration_run_writing.total_seconds() % 60.0))
    print("\t Total run duration: %4g minutes %2g seconds \n" %
          (duration_run.total_seconds() // 60.0,
           duration_run.total_seconds() % 60.0))
    print('--------------------------------------------------------------')
    print('\t SIMULATION WAS SUCCESSFUL')
    print('--------------------------------------------------------------')