Example #1
async def client(cluster):
    async with Client(cluster, asynchronous=True) as client:
        yield client
Example #2
def start_cluster(nb_workers, walltime, out_dir, timeout=600):
    """
    This function creates a Dask cluster.
    Each worker has nb_cpus CPUs.
    Only one Python process is started on each worker.

    Number of threads:
    start_cluster uses the OMP_NUM_THREADS environment variable to determine
    how many threads a worker may use when running C/C++ code.
    (defaults to 1)

    Number of workers:
    start_cluster uses the CARS_NB_WORKERS_PER_PBS_JOB environment variable
    to determine how many workers should be started by a single PBS job.
    (defaults to 1)

    Worker queue:
    start_cluster uses the CARS_PBS_QUEUE environment variable to determine
    which queue worker jobs should be submitted to.

    :param nb_workers: Number of dask workers
    :type nb_workers: int
    :param walltime: Walltime for each dask worker
    :type walltime: string
    :param out_dir: Output directory
    :type out_dir: string
    :param timeout: Dask client connection timeout, in seconds
    :type timeout: int
    :return: Dask cluster and dask client
    :rtype: (dask_jobqueue.PBSCluster, dask.distributed.Client) tuple
    """
    # Retrieve multi-threading factor for C/C++ code if available
    omp_num_threads = 1
    if os.environ.get('OMP_NUM_THREADS'):
        omp_num_threads = int(os.environ['OMP_NUM_THREADS'])

    # Retrieve number of workers per PBS job
    nb_workers_per_job = 1
    if os.environ.get('CARS_NB_WORKERS_PER_PBS_JOB'):
        nb_workers_per_job = int(os.environ['CARS_NB_WORKERS_PER_PBS_JOB'])

    # Retrieve PBS queue
    pbs_queue = None
    if os.environ.get('CARS_PBS_QUEUE'):
        pbs_queue = os.environ['CARS_PBS_QUEUE']

    # Total number of cpus is multi-threading factor times size of batch
    # (number of workers per PBS job)
    nb_cpus = nb_workers_per_job * omp_num_threads
    # Cluster nodes have 5GB per core
    memory = nb_cpus * 5000
    # Resource string for PBS
    resource = "select=1:ncpus={}:mem={}mb".format(nb_cpus, memory)

    nb_jobs = int(math.ceil(nb_workers / nb_workers_per_job))

    logging.info(
        "Starting Dask PBS cluster with {} workers "
        "({} workers with {} cores each per PSB job)".format(
        nb_workers,
        nb_workers_per_job, omp_num_threads))

    logging.info(
        "Submitting {} PBS jobs "
        "with configuration cpu={}, mem={}, walltime={}".format(
        nb_jobs,
        nb_cpus, memory, walltime))

    names = [
        'PATH',
        'PYTHONPATH',
        'CARS_STATIC_CONFIGURATION',
        'LD_LIBRARY_PATH',
        'OTB_APPLICATION_PATH',
        'OTB_GEOID_FILE',
        'OMP_NUM_THREADS',
        'NUMBA_NUM_THREADS',
        'OPJ_NUM_THREADS',
        'GDAL_NUM_THREADS',
        'OTB_MAX_RAM_HINT',
        'VIRTUAL_ENV',
        'ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS',
        'GDAL_CACHEMAX']
    names = [name for name in names if os.environ.get(name)]
    envs = ["export {}={}".format(name, os.environ[name]) for name in names]
    log_directory = os.path.join(os.path.abspath(out_dir), "dask_log")
    local_directory = '$TMPDIR'
    cluster = PBSCluster(
        processes=nb_workers_per_job,
        cores=nb_workers_per_job,
        resource_spec=resource,
        memory="{}MB".format(memory),
        local_directory=local_directory,
        project='dask-test',
        walltime=walltime,
        interface='ib0',
        queue=pbs_queue,
        env_extra=envs,
        log_directory=log_directory)
    logging.info("Dask cluster started")
    cluster.scale(nb_workers)
    client = Client(cluster, timeout=timeout)
    logging.info("Dashboard started at {}".format(get_dashboard_link(cluster)))
    return cluster, client
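A minimal usage sketch for start_cluster, assuming the imports above are in scope; the worker count, walltime, and output directory are illustrative values, not from the original:

if __name__ == "__main__":
    # The environment variables documented in the docstring are optional;
    # unset values fall back to 1 worker per PBS job and 1 OpenMP thread.
    os.environ.setdefault("CARS_NB_WORKERS_PER_PBS_JOB", "2")

    cluster, client = start_cluster(nb_workers=8, walltime="01:00:00", out_dir="/tmp/cars_out")
    try:
        # sanity check: run a trivial task on the cluster
        print(client.submit(lambda x: x + 1, 41).result())
    finally:
        client.close()
        cluster.close()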
Example #3
    def client(self):

        if self.scheduler:
            return self.scheduler.client
        else:
            return Client()
Example #4
# import pop_tools
# import xoak
# import xesmf as xe

# dask jupyter lab packages
from dask.distributed import Client
# from dask.distributed import performance_report

# file name with time packages
# from itertools import product
# from cftime import DatetimeNoLeap
# ======================


# === incorporate dask ===
client = Client("tcp://10.73.1.1:36170")
client
# ========================


# === define parameters ===
# --- t12 ---
# FSm: Florida Strait Meridional section
# 292  292 1418 1443    1   42 merid  Florida Strait 
# reference: https://ncar.github.io/POP/doc/build/html/users_guide/model-diagnostics-and-output.html
ilon1_FSm_t12 = 292-1 
ilat1_FSm_t12, ilat2_FSm_t12 = 1418-1-1, 1443 
# -1 because Python indices start at 0 while Fortran indices start at 1
# CESM-POP subroutine diag_transport, lines 2010-2255 at https://github.com/ESCOMP/POP2-CESM/blob/master/source/diagnostics.F90
# the transport diagnostic averages along the nlat dim, so one extra point before 1418 is read; the two end points are on land (KMU=0, see code below)
# -----------
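A hedged sketch of how these indices might be applied to the POP output, assuming it has been opened as a dask-backed xarray Dataset; the file name and the UVEL variable are illustrative assumptions, not from the original:

import xarray as xr

# hypothetical: lazily open t12 POP output with dask chunks
ds = xr.open_dataset("t12_pop_monthly.nc", chunks={"time": 1})

# Florida Strait meridional section, using the 0-based indices defined above
uvel_FS = ds["UVEL"].isel(
    nlon=ilon1_FSm_t12,
    nlat=slice(ilat1_FSm_t12, ilat2_FSm_t12),
)
print(uvel_FS.sizes)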
Example #5
def client():
    with Client(n_workers=2, threads_per_worker=1) as client:
        yield client
Example #6
def main():
    # Define parameters to use for multiprocessing
    client = Client(processes=False)
    num_workers = min(multiprocessing.cpu_count(), 7)
    print('Number of workers = ', num_workers)
    run_start_time = time.time()

    # Proposed changes:
    # 1) Increase government spending - increase by 2pp of GDP, starting in 2020 ending after 2022
    # 2) Cut corporate income tax rate to 25%, permanently
    # 3) Increase the standard deduction from 40,000 to 100,000, starting in 2020

    # Specify direct tax reform
    dt_reform = {2020: {'_std_deduction': [100000]}}

    # Set some model parameters
    # See parameters.py for description of these parameters
    OG_params = {
        'alpha_G': [
            0.112, 0.132, 0.132, 0.132, 0.112
        ],  # specify policy from 2019 through 2023- assuming constant thereafter
        'tau_b':
        [0.27, 0.25 * 0.27 / 0.34],  # note the rate is the effective tax rate
        'age_specific': False,  # don't estimate age specific tax functions
        'tax_func_type': 'linear'
    }  # estimate linear marginal and effective tax rate functions
    '''
    ------------------------------------------------------------------------
    Run baseline policy first
    ------------------------------------------------------------------------
    '''
    output_base = BASELINE_DIR
    kwargs = {
        'output_base': output_base,
        'baseline_dir': BASELINE_DIR,
        'test': False,
        'time_path': True,
        'baseline': True,
        'user_params': {
            'age_specific': False,
            'tax_func_type': 'linear'
        },
        'guid': '_TPRU_19232019',
        'run_micro': True,
        'data': 'pitSmallData.csv',
        'client': client,
        'num_workers': num_workers
    }

    start_time = time.time()
    runner(**kwargs)
    print('run time = ', time.time() - start_time)
    '''
    ------------------------------------------------------------------------
    Run reform policy
    ------------------------------------------------------------------------
    '''
    output_base = REFORM_DIR
    kwargs = {
        'output_base': output_base,
        'baseline_dir': BASELINE_DIR,
        'test': False,
        'time_path': True,
        'baseline': False,
        'user_params': OG_params,
        'guid': '_TPRU_19232019_policy',
        'reform': dt_reform,
        'run_micro': True,
        'data': 'pitSmallData.csv',
        'client': client,
        'num_workers': num_workers
    }

    start_time = time.time()
    runner(**kwargs)
    print('run time = ', time.time() - start_time)

    # return ans - the percentage changes in macro aggregates and prices
    # due to policy changes from the baseline to the reform
    ans = postprocess.create_diff(baseline_dir=BASELINE_DIR,
                                  policy_dir=REFORM_DIR)

    print("total time was ", (time.time() - run_start_time))
    print('Percentage changes in aggregates:', ans)
Example #7
 def runCrossValidate(self, verbose=False):
     self.logger.log("Cross-validate started...",
                     self.step_n,
                     message="Running cross validation")
     n_jobs = self.cv_n_jobs
     cv_results = {}
     new_cv_results = {}
     cv = self.getCV()
     # n_jobs = -1
     if verbose:
         logger.info(
             f"RunCrossValidate - n_jobs: {n_jobs}, scorer_list: {self.scorer_list}"
         )
     for pipe_name, model in self.model_dict.items():
         if verbose:
             logger.info(
                 f"RunCrossValidate - Running CV on pipe_name: {pipe_name}")
         start = time.time()
         dask_scheduler = os.getenv(
             "DASK_SCHEDULER", "tcp://" +
             socket.gethostbyname(socket.gethostname()) + ":8786")
         client = Client(dask_scheduler)
         with parallel_backend('dask', n_jobs=-1):  # 40min test case
             model_i = cross_validate(model,
                                      self.X_df,
                                      self.y_df.iloc[:, 0],
                                      return_estimator=True,
                                      scoring=self.scorer_list,
                                      cv=cv,
                                      n_jobs=1,
                                      verbose=3)
         end = time.time()
         if verbose:
             logger.info(
                 f"SCORES - {pipe_name},{[(scorer,np.mean(model_i[f'test_{scorer}'])) for scorer in self.scorer_list]}, runtime: {(end-start)/60} min."
             )
             logger.info(f"MODELS - {pipe_name},{model_i}")
         cv_results[pipe_name] = model_i
     if self.run_stacked:
         for est_name, result in cv_results.items():
             if type(result['estimator'][0]) is MultiPipe:
                 new_results = {}
                 for mp in result['estimator']:
                     for est_n, m in mp.build_individual_fitted_pipelines(
                     ).items():
                         if not est_n in new_results:
                             new_results[est_n] = []
                         new_results[est_n].append(m)
                 for est_n in new_results:
                     if est_n in cv_results:
                         est_n += '_fcombo'
                     new_cv_results[est_n] = {
                         'estimator': new_results[est_n]
                     }
         cv_results = {**new_cv_results, **cv_results}
         if verbose:
             logger.info("CV Results: {}".format(cv_results))
     self.cv_results = cv_results
     self.logger.log("Cross-validate complete.",
                     self.step_n,
                     message="Completed cross validation")
Example #8
def ucx_client(ucx_cluster):
    client = Client(ucx_cluster)
    yield client
    client.close()
Example #9
    DATA_URL = Parameter(
        "DATA_URL",
        default="https://github.com/cicdw/image-data/blob/master/all-images.img?raw=true",
    )

    DATA_FILE = Parameter("DATA_FILE", default="image-data.img")

    with Flow("Image ETL") as flow:

        # Extract
        command = curl_cmd(DATA_URL, DATA_FILE)
        curl = download(command=command)

        # Transform
        # we use the `upstream_tasks` keyword to specify non-data dependencies
        images = load_and_split(fname=DATA_FILE, upstream_tasks=[curl])

        # Load
        frames = write_to_disk.map(images)
        result = combine_to_gif(frames)

    flow.visualize()

    # start our Dask cluster
    client = Client(n_workers=4, threads_per_worker=1)

    # point Prefect's DaskExecutor to our Dask cluster
    executor = DaskExecutor(address=client.scheduler.address)
    flow.run(executor=executor)
Example #10
data_index = ['Seq Record ID', 'Resistance', 'Name', 'Genus', 'DNA Type', 'Strain', 'Bacteria Type', 'Notes']
# combining the non-numerical and categorical labels for the pandas dataframe
data_index.extend(data_categories)

# Number of samples per species 
num_training_samples = 1000
# List of error testing rates
error_rate = [0]
# List of number of optical sequencing reads
num_reads = [1000000]

# Getting the list of all the data files that need to be tested
file_list = [file for file in os.listdir(local_SERS)]
    
# Initializing dask to run things in parallel
with LocalCluster(processes=False) as cluster, Client(cluster) as client:

    # Cycling through the combinations of error rate and number of reads
    for error in error_rate:
        # Recording the time it takes to run everything
        for reads in num_reads:
            # Getting the error and reads values in string form for file identification
            str_err = '_%s_' % (int(100 * error))
            str_read = '_%s_' % (reads)
            # Getting the list of the files for the specific reads and errors
            working_genome_list = [file for file in file_list if str_err in file and str_read in file and 'Genome' in file]
            # Making sure that the list isn't empty
            run_list = []
            if len(working_genome_list) != 0:
                run_list.append(working_genome_list)
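The example is truncated at this point; a hedged sketch of how the collected file lists might be handed to the Dask workers, continuing inside the with-block above, where process_genome_files is a hypothetical worker function:

    def process_genome_files(genome_files):
        # hypothetical worker: load and analyze one batch of genome files
        return len(genome_files)

    futures = client.map(process_genome_files, run_list)
    results = client.gather(futures)
    print(results)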
        
Example #11
def client(cluster):
    client = Client(cluster)
    yield client
    client.close()
Example #12
def test_mstumped(T, m, dask_cluster):
    with Client(dask_cluster) as dask_client:
        ref = stumpy.maamped(dask_client, T, m)
        comp = stumpy.mstumped(dask_client, T, m, normalize=False)
        npt.assert_almost_equal(ref, comp)
Example #13
async def test_cluster_create(cluster):
    cluster.scale(1)
    await cluster
    async with Client(cluster, asynchronous=True) as client:
        result = await client.submit(lambda x: x + 1, 10)
        assert result == 11
Example #14
async def test_start_with_workers(k8s_cluster, pod_spec):
    async with KubeCluster(pod_spec, n_workers=2, **cluster_kwargs) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            while len(cluster.scheduler_info["workers"]) != 2:
                await asyncio.sleep(0.1)
Example #15
import logging

import coloredlogs
import imageio
import numpy as np

from dask.distributed import Client, LocalCluster
from utoolbox.io.dataset import BigDataViewerDataset, LatticeScopeTiledDataset

if __name__ == "__main__":
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    logger = logging.getLogger(__name__)

    if True:
        cluster = LocalCluster(n_workers=4, threads_per_worker=4)
        client = Client(cluster)
    else:
        client = Client("10.109.20.6:8786")
    logger.info(client)

    src_ds = LatticeScopeTiledDataset(None)
    print(src_ds.inventory)

    logger.info(f"tile by {src_ds.tile_shape}")

    # import ipdb; ipdb.set_trace()

    z = src_ds.index.get_level_values("tile_z").unique().values
    mid_z = z[len(z) // 2]
    logger.info(f"mid z plane @ {mid_z}")
Example #16
def main():
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory=f"64 G",
        resource_spec=f"h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            f"-l disk=32G",
        ],
        local_directory=os.sep.join(
            [os.environ.get("PWD"), "dask-worker-scale-space"]),
    )

    client = Client(cluster)

    cluster.scale(jobs=n_jobs)

    time_start = time.time()

    # scale custom outputs
    if normal:
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
                np.linspace(0.0, 1.4, 8),
            )).T.reshape(-1, 5)
        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
            )

    if extra:
        custom_inputs_main = [
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]])
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if climate_cobenefits:
        custom_inputs_main = [
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ]
        custom_inputs = []
        for custom_input in custom_inputs_main:
            custom_input_res = np.copy(custom_input)
            custom_input_ind = np.copy(custom_input)
            custom_input_tra = np.copy(custom_input)
            custom_input_agr = np.copy(custom_input)
            custom_input_ene = np.copy(custom_input)
            custom_input_nores = np.copy(custom_input)
            custom_input_noind = np.copy(custom_input)
            custom_input_notra = np.copy(custom_input)
            custom_input_noagr = np.copy(custom_input)
            custom_input_noene = np.copy(custom_input)
            custom_input_resonly = np.copy(custom_input)
            custom_input_indonly = np.copy(custom_input)
            custom_input_traonly = np.copy(custom_input)
            custom_input_agronly = np.copy(custom_input)
            custom_input_eneonly = np.copy(custom_input)

            custom_input_res[0][1:] = 1.0
            custom_input_ind[0][0] = 1.0
            custom_input_ind[0][2:] = 1.0
            custom_input_tra[0][:2] = 1.0
            custom_input_tra[0][3:] = 1.0
            custom_input_agr[0][:3] = 1.0
            custom_input_agr[0][4:] = 1.0
            custom_input_ene[0][:4] = 1.0

            custom_input_nores[0][0] = 0.0
            custom_input_noind[0][1] = 0.0
            custom_input_notra[0][2] = 0.0
            custom_input_noagr[0][3] = 0.0
            custom_input_noene[0][4] = 0.0

            custom_input_resonly[0][1:] = 0.0
            custom_input_indonly[0][0] = 0.0
            custom_input_indonly[0][2:] = 0.0
            custom_input_traonly[0][:2] = 0.0
            custom_input_traonly[0][3:] = 0.0
            custom_input_agronly[0][:3] = 0.0
            custom_input_agronly[0][4:] = 0.0
            custom_input_eneonly[0][:4] = 0.0

            custom_inputs.append(custom_input)
            custom_inputs.append(custom_input_res)
            custom_inputs.append(custom_input_ind)
            custom_inputs.append(custom_input_tra)
            custom_inputs.append(custom_input_agr)
            custom_inputs.append(custom_input_ene)
            custom_inputs.append(custom_input_nores)
            custom_inputs.append(custom_input_noind)
            custom_inputs.append(custom_input_notra)
            custom_inputs.append(custom_input_noagr)
            custom_inputs.append(custom_input_noene)
            custom_inputs.append(custom_input_resonly)
            custom_inputs.append(custom_input_indonly)
            custom_inputs.append(custom_input_traonly)
            custom_inputs.append(custom_input_agronly)
            custom_inputs.append(custom_input_eneonly)

        emission_configs_20percentintervals = []
        for custom_input in custom_inputs:
            emission_config = f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            emission_configs_20percentintervals.append(emission_config)

    if top_down_2020_baseline:
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613,
             0.724])  # matching to PM2.5 only, top 1,000
        emission_configs = np.array(
            np.meshgrid(
                np.linspace(
                    emission_config_2020_baseline[0] * 0.50,
                    emission_config_2020_baseline[0], 6
                ),  # 10% reduction increments from 2020 baseline up to 50%
                np.linspace(emission_config_2020_baseline[1] * 0.50,
                            emission_config_2020_baseline[1], 6),
                np.linspace(emission_config_2020_baseline[2] * 0.50,
                            emission_config_2020_baseline[2], 6),
                np.linspace(emission_config_2020_baseline[3] * 0.50,
                            emission_config_2020_baseline[3], 6),
                np.linspace(emission_config_2020_baseline[4] * 0.50,
                            emission_config_2020_baseline[4], 6),
            )).T.reshape(-1, 5)
        # add a couple more for larger reductions in RES and IND to reach WHO-IT2
        emission_configs = list(emission_configs)
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))

        emission_configs_20percentintervals = []
        for emission_config in emission_configs:
            emission_configs_20percentintervals.append(
                f'RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}'
            )

    emission_configs_completed = glob.glob(
        f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted/ds*{output}_popgrid_0.25deg_adjusted.nc"
    )
    emission_configs_completed = [
        f"{item[81:-38]}" for item in emission_configs_completed
    ]

    emission_configs_20percentintervals_remaining_set = set(
        emission_configs_20percentintervals) - set(emission_configs_completed)
    emission_configs_remaining = [
        item for item in emission_configs_20percentintervals_remaining_set
    ]
    print(
        f"custom outputs remaining for {output}: {len(emission_configs_remaining)} - 20% intervals with {int(100 * len(emission_configs_20percentintervals_remaining_set) / len(emission_configs_20percentintervals))}% remaining"
    )

    # dask bag and process
    emission_configs_remaining = emission_configs_remaining[:15000]
    print(
        f"predicting for {len(emission_configs_remaining)} custom outputs ...")
    bag_emission_configs = db.from_sequence(emission_configs_remaining,
                                            npartitions=n_workers)
    bag_emission_configs.map(adjust).compute()

    time_end = time.time() - time_start
    print(
        f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
    )
    print(
        f"average time per custom output is {time_end / len(emission_configs_remaining):0.2f} seconds"
    )

    client.close()
    cluster.close()
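The tail of this example relies on the dask.bag from_sequence / map / compute pattern; a self-contained sketch of that pattern (the items and the work done per item are illustrative stand-ins for the emission configs and adjust() above):

import dask.bag as db
from dask.distributed import Client, LocalCluster

with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
    items = [f"config_{i}" for i in range(10)]
    bag = db.from_sequence(items, npartitions=2)
    # each partition is processed as a separate task on the cluster
    results = bag.map(str.upper).compute()
    print(results)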
Example #17
#DATA = DATA_PARQUET_10_SNAPPY  # note 250MB partitions
#columns = ['date', 'pt', 'new'] # more efficient process
#client = Client(n_workers=32, threads_per_worker=1, processes=True, memory_limit='3GB')
# succeeds in 3s

#DATA = DATA_PARQUET_10_SNAPPY  # note 250MB partitions
#columns = ['date', 'pt', 'new'] # more efficient process
#client = Client(n_workers=10, threads_per_worker=1, processes=True, memory_limit='3GB')
# succeeds in 3s

if __name__ == "__main__":
    if 'client' not in dir():
        #client = Client(n_workers=32, threads_per_worker=1, processes=True, memory_limit='3GB')
        client = Client(n_workers=10,
                        threads_per_worker=1,
                        processes=True,
                        memory_limit='3GB')
        #client = Client(n_workers=2, threads_per_worker=1, processes=True, memory_limit='7GB')
        print(client)

    columns = ['date', 'pt', 'new']  # more efficient process
    #columns = None # used in conf talk
    DATA = DATA_PARQUET_10_SNAPPY
    #DATA = DATA_PARQUET_10_NOCOMPRESSION
    #DATA = DATA_PARQUET_100_SNAPPY
    ddf = dd.read_parquet(DATA, columns=columns)

    #ddf.head()

    #%%time
    t1 = time.time()
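The listing stops right after the timer starts; a hedged sketch of a timed aggregation over the selected columns (the groupby and sum are illustrative choices, not taken from the original):

    result = ddf.groupby('pt')['new'].sum().compute()
    print(result.head())
    print('elapsed seconds = ', time.time() - t1)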
Example #18
def index_f_and_f(dump_pk, user_pk):
    """
    Run all plugins for a new index on Dask
    """
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(dask_client.submit(unzip_then_run, dump_pk, user_pk))
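fire_and_forget tells the scheduler to keep running the submitted task even after the local Future (and the client that created it) goes away; a minimal standalone sketch, with an illustrative scheduler address and placeholder job:

from dask.distributed import Client, fire_and_forget

def long_running_job(x):
    # placeholder for unzip_then_run-style work
    return x * 2

client = Client("tcp://127.0.0.1:8786")  # illustrative scheduler address
# Without fire_and_forget, the task could be cancelled once the Future
# returned by submit is garbage collected when this scope exits.
fire_and_forget(client.submit(long_running_job, 21))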
Example #19
def setup_workers(numCore):
    cluster = setup_cluster()
    from dask.distributed import Client
    client = Client(cluster)
    cluster.start_workers(numCore)
    return cluster, client
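Note that cluster.start_workers comes from older dask-jobqueue releases; current releases expose the equivalent as cluster.scale. A hedged sketch of the same helper against that API (setup_cluster is assumed to return a dask-jobqueue cluster object):

def setup_workers_scaled(num_cores):
    cluster = setup_cluster()  # assumed to return a dask-jobqueue cluster object
    from dask.distributed import Client
    client = Client(cluster)
    cluster.scale(num_cores)  # request num_cores workers via the job scheduler
    return cluster, client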
Example #20
    conn.execute('''DROP INDEX IF EXISTS idx_movie2;''')
    conn.execute('''CREATE INDEX idx_movie1 ON moviePairs(movie1);''')
    conn.execute('''CREATE INDEX idx_movie2 ON moviePairs(movie2);''')
    conn.commit()

    for i in mps.itertuples(name=None):
        #print(i)
        conn.execute(
            '''INSERT INTO moviePairs(movie1,movie2,numPairs,score)
VALUES(?,?,?,?);''', (i[0][0], i[0][1], i[2], i[1]))
    conn.commit()


#Main
if __name__ == "__main__":
    client = Client(processes=False)
    print(client)

    fileRatings = "ml-1m/ratings.dat"
    fileNames = "ml-1m/movies.dat"

    moviePairSimilarities = computeMoviePairSimilarities(fileRatings)

    conn = sqlite3.connect('movieNamesDask.db')
    #moviePairsToSQL(conn,moviePairSimilarities)
    conn.close()

    #moviePairSimilarities.to_cvs("similarities.json")

    movieNames = pd.read_csv(fileNames,
                             names=["movieID", "title"],
Example #21
        y_train,
        validation_data=(x_valid, y_valid),
        shuffle=True,
        batch_size=BATCHSIZE,
        epochs=EPOCHS,
        verbose=False,
    )

    # Evaluate the model accuracy on the validation set.
    score = model.evaluate(x_valid, y_valid, verbose=0)
    return score[1]


if __name__ == "__main__":

    with Client() as client:
        print(f"Dask dashboard is available at {client.dashboard_link}")

        storage = dask_optuna.DaskStorage()
        study = optuna.create_study(storage=storage, direction="maximize")
        with joblib.parallel_backend("dask"):
            study.optimize(objective, n_trials=20, n_jobs=-1)

        print("Number of trials: {}".format(len(study.trials)))

        print("Best trial:")
        trial = study.best_trial

        print("  Value: {}".format(trial.value))

        print("  Params: ")
Example #22
import dask.array
from sympy import latex
from pathlib import Path

from dask.distributed import Client
from dask.distributed import LocalCluster
import bgc_md2.models.cable_all.cableCache as cC
import bgc_md2.models.cable_all.cablePaths as cP
import bgc_md2.models.cable_all.cableHelpers as cH

if __name__ == "__main__":
    if "cluster" not in dir():
        # cluster = LocalCluster(n_workers = 1)
        cluster = LocalCluster()

    client = Client(cluster)

# +
try:
    from ports.server_helpers import print_commands

    print_commands(cluster, local_port=8880)

except ImportError as e:
    pass  # module doesn't exist, don't make a fuss
# -

# choose the cable output directory you want to work with
cable_out_path = Path(
    "/home/data/cable-data/example_runs/parallel_1901_2004_with_spinup/output/new4"
)
Example #23
import os
import shutil
import time
import schemas
from dask.distributed import Client
import dask.dataframe as dd

TABLE_NAME = os.environ['TABLE_NAME']
SOURCE = f'/data/raw/{TABLE_NAME}'
DESTINATION = f'/data/default/{TABLE_NAME}'

if __name__ == '__main__':
    # start the client
    client = Client()

    # find the files that exist
    fnames = [f'{SOURCE}/{fname}' for fname in os.listdir(SOURCE)]

    # read in existing data
    df = dd.read_json(fnames, dtype=getattr(schemas, TABLE_NAME))

    # add a column for partitions
    df['hour'] = df['timestamp'].dt.floor('h')

    # write out files
    start_time = time.time()
    df.to_parquet(DESTINATION,
                  engine='fastparquet',
                  append=True,
                  compression='gzip',
                  partition_on=['pair', 'hour'],
Example #24
def full_pipeline_dask(job_name, train_function, eval_function, infer_function,
                       **kwargs):
    # original training
    if os.environ.get('FASTMRI_DEBUG'):
        n_epochs_train = 1
        n_epochs_fine_tune = 1
        n_eval_samples = 1
        n_inference_samples = 1
    else:
        n_epochs_train = 250
        n_epochs_fine_tune = 50
        n_eval_samples = 50
        n_inference_samples = None
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='80:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.scale(2)
    client = Client(train_cluster)
    acceleration_factors = [4, 8]
    futures = [
        client.submit(
            # function to execute
            train_function,
            af=af,
            n_epochs=n_epochs_train,
            **kwargs,
            # this function has potential side effects
            pure=True,
        ) for af in acceleration_factors
    ]
    run_ids = client.gather(futures)
    # fine tuning
    train_cluster.scale(4)
    contrasts = ['CORPDFS_FBK', 'CORPD_FBK']
    futures = []
    for af, run_id in zip(acceleration_factors, run_ids):
        for contrast in contrasts:
            futures += [
                client.submit(
                    # function to execute
                    train_function,
                    af=af,
                    contrast=contrast,
                    original_run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    **kwargs,
                    # this function has potential side effects
                    pure=True,
                )
            ]
    fine_tuned_run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # inference and eval
    inference_eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    inference_eval_cluster.scale(8)
    client = Client(inference_eval_cluster)
    i_run_id = 0
    inference_futures = []
    eval_futures = []
    kwargs.pop('loss')
    for af in acceleration_factors:
        for contrast in contrasts:
            run_id = fine_tuned_run_ids[i_run_id]
            inference_futures += [
                client.submit(
                    # function to execute
                    infer_function,
                    contrast=contrast,
                    af=af,
                    run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    n_samples=n_inference_samples,
                    exp_id=job_name,
                    **kwargs,
                    # this function has potential side effects
                    pure=True,
                )
            ]
            eval_futures += [
                client.submit(
                    # function to execute
                    eval_function,
                    contrast=contrast,
                    af=af,
                    run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    n_samples=n_eval_samples,
                    **kwargs,
                    # this function has potential side effects
                    pure=True,
                )
            ]
            i_run_id += 1
    client.gather(inference_futures)
    # eval printing
    i_run_id = 0
    for af in acceleration_factors:
        for contrast in contrasts:
            metrics_names, eval_res = client.gather(eval_futures[i_run_id])
            print('AF', af)
            print('Contrast', contrast)
            print(metrics_names)
            print(eval_res)
            i_run_id += 1
    print('Shutting down dask workers')
    client.close()
    inference_eval_cluster.close()
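A small aside on the submit calls above, which pass pure=True while the comments note side effects: a minimal sketch of the submit/gather pattern where, per Dask's semantics, pure=False prevents calls with identical arguments from being collapsed into one cached task (the local client and pow are illustrative):

from dask.distributed import Client

client = Client()  # illustrative local client
futures = [client.submit(pow, 2, n, pure=False) for n in range(4)]
print(client.gather(futures))  # [1, 2, 4, 8]
client.close()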
Example #25
def run_micro_macro(user_params):
    # Grab a reform JSON file already in Tax-Calculator
    # In this example the 'reform' is a change to 2017 law (the
    # baseline policy is tax law in 2018)
    reform_url = ('https://raw.githubusercontent.com/'
                  'PSLmodels/Tax-Calculator/master/taxcalc/'
                  'reforms/2017_law.json')
    #ref = Calculator.read_json_param_objects(reform_url, None) # Modified
    #reform = ref['policy'] # Modified

    # Define parameters to use for multiprocessing
    client = Client(processes=False)
    num_workers = 1  # multiprocessing.cpu_count()
    print('Number of workers = ', num_workers)
    start_time = time.time()

    # Set some model parameters
    # See parameters.py for description of these parameters
    alpha_T = np.ones(50) * 0.1230058215  # Modified
    alpha_G = np.ones(7) * 0.01234569933  # Modified
    small_open = False
    user_params = {
        'frisch': 0.5,
        'start_year': 2018,
        'tau_b': [(0.21 * 0.55) * (0.017 / 0.055),
                  (0.21 * 0.55) * (0.017 / 0.055)],
        'debt_ratio_ss': 2.0,
        'alpha_T': alpha_T.tolist(),
        'alpha_G': alpha_G.tolist(),
        'small_open': small_open
    }  # modified
    '''
    ------------------------------------------------------------------------
    Run baseline policy first
    ------------------------------------------------------------------------
    '''
    output_base = BASELINE_DIR
    kwargs = {
        'output_base': output_base,
        'baseline_dir': BASELINE_DIR,
        'test': False,
        'time_path': False,
        'baseline': True,
        'user_params': user_params,
        'guid': '_example',
        'run_micro': False,
        'data': 'cps',
        'client': client,
        'num_workers': num_workers
    }

    start_time = time.time()
    runner(**kwargs)
    print('run time = ', time.time() - start_time)
    '''
    ------------------------------------------------------------------------
    Run reform policy
    ------------------------------------------------------------------------
    '''
    user_params = {
        'frisch': 0.5,
        'start_year': 2018,
        'tau_b': [(0.35 * 0.55) * (0.017 / 0.055)],
        'debt_ratio_ss': 1.0,
        'alpha_T': alpha_T.tolist(),
        'alpha_G': alpha_G.tolist(),
        'small_open': small_open
    }
    output_base = REFORM_DIR
    kwargs = {
        'output_base': output_base,
        'baseline_dir': BASELINE_DIR,
        'test': False,
        'time_path': True,
        'baseline': False,
        'user_params': user_params,
        'guid': '_example',
        'reform': reform,
        'run_micro': True,
        'data': 'cps',
        'client': client,
        'num_workers': num_workers
    }

    start_time = time.time()
    runner(**kwargs)
    print('run time = ', time.time() - start_time)

    # return ans - the percentage changes in macro aggregates and prices
    # due to policy changes from the baseline to the reform
    ans = postprocess.create_diff(baseline_dir=BASELINE_DIR,
                                  policy_dir=REFORM_DIR)

    print("total time was ", (time.time() - start_time))
    print('Percentage changes in aggregates:', ans)
Example #26
def train_eval_parameter_grid(job_name, train_function, eval_function,
                              parameter_grid):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.scale(n_parameters_config)
    client = Client(train_cluster)
    futures = [
        client.submit(
            # function to execute
            train_function,
            **params,
        ) for params in parameters
    ]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
    for params in parameters:
        params.pop('n_samples')
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=50,
            **params,
        ) for run_id, params in zip(run_ids, parameters)
    ]

    for params, future in zip(parameters, futures):
        metrics_names, eval_res = client.gather(future)
        print('Parameters', params)
        print(metrics_names)
        print(eval_res)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
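ParameterGrid expands a dict of lists into the full cross product of configurations, which is why len(parameters) gives the number of jobs to scale to; a minimal sketch with illustrative parameter names:

from sklearn.model_selection import ParameterGrid

grid = {"n_samples": [100, 200], "loss": ["mae", "mse"]}
for params in ParameterGrid(grid):
    print(params)  # e.g. {'loss': 'mae', 'n_samples': 100}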
Example #27
    def init(self):
        ### initializing required classes
        self._execute_notebook_class = ExecuteNotebookWriter(self)
        self._make_site_class = MakeSiteWriter(self)
        self.executedir = self.outdir + '/executed'
        self.reportdir = self.outdir + '/reports/'
        self.errordir = self.outdir + "/reports/{}"
        self.downloadsdir = self.outdir + "/_downloads"
        self.downloadsExecutedir = self.downloadsdir + "/executed"
        self.client = None

        # Check default language is defined in the jupyter kernels
        def_lng = self.config["jupyter_default_lang"]
        if def_lng not in self.config["jupyter_kernels"]:
            self.logger.warning(
                "Default language defined in conf.py ({}) is not "
                "defined in jupyter_kernels in conf.py. "
                "Setting default language to python3."
                .format(def_lng))
            self.config["jupyter_default_lang"] = "python3"
        # If the user has overridden anything on the command line, set these things which have been overridden.
        instructions = []
        overrides = self.config['jupyter_options']
        if overrides:
            instructions = overrides.split(",")

        for instruction in instructions:
            if instruction:
                if instruction == 'code_only':
                    self.config["jupyter_conversion_mode"] = "code"
                else:
                    # Fail on unrecognised command.
                    self.logger.warning("Unrecognise command line parameter " + instruction + ", ignoring.")

        #threads per worker for dask distributed processing
        if "jupyter_threads_per_worker" in self.config:
            self.threads_per_worker = self.config["jupyter_threads_per_worker"]

        #number of workers for dask distributed processing
        if "jupyter_number_workers" in self.config:
            self.n_workers = self.config["jupyter_number_workers"]

        # Start a dask client to process the notebooks efficiently.
        # processes=False is sometimes preferable when you want to avoid inter-worker
        # communication and your computations release the GIL, which is common when
        # primarily using NumPy or Dask Array.
        if self.config["jupyter_execute_notebooks"]:
            self.client = Client(processes=False, threads_per_worker=self.threads_per_worker, n_workers=self.n_workers)
            self.execution_vars = {
                'target': 'website',
                'dependency_lists': self.config["jupyter_dependency_lists"],
                'executed_notebooks': [],
                'delayed_notebooks': dict(),
                'futures': [],
                'delayed_futures': [],
                'destination': self.executedir
            }
        
        if (self.config["jupyter_download_nb_execute"]):
            if self.client is None:
                self.client = Client(processes=False, threads_per_worker = self.threads_per_worker, n_workers = self.n_workers)
            self.download_execution_vars = {
                'target': 'downloads',
                'dependency_lists': self.config["jupyter_dependency_lists"],
                'executed_notebooks': [],
                'delayed_notebooks': dict(),
                'futures': [],
                'delayed_futures': [],
                'destination': self.downloadsExecutedir
            }
Example #28
def client(cluster):
    with Client(cluster) as client:
        yield client
Example #29
    def test_empty_dmatrix(self):
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                parameters = {'tree_method': 'gpu_hist'}
                run_empty_dmatrix(client, parameters)
Example #30
        }
        exe = processor.futures_executor

    elif iterative:
        exe_args = {
            'function_args': {'flatten': False},
            "schema": NanoAODSchema,
        }
        exe = processor.iterative_executor

    else:
        from Tools.helpers import get_scheduler_address
        from dask.distributed import Client, progress

        scheduler_address = get_scheduler_address()
        c = Client(scheduler_address)

        exe_args = {
            'client': c,
            'function_args': {'flatten': False},
            "schema": NanoAODSchema,
            "tailtimeout": 300,
            "retries": 3,
            "skipbadfiles": True
        }
        exe = processor.dask_executor

    # add some histograms that we defined in the processor
    # everything else is taken from default_accumulators.py
    from processor.default_accumulators import multiplicity_axis, dataset_axis, score_axis, pt_axis, ht_axis
    desired_output.update({