async def client(cluster):
    """Yield an asynchronous Dask client connected to ``cluster``.

    The client is opened with ``asynchronous=True`` inside an ``async with``
    block, so it is closed when the consumer of this async generator is done.
    """
    async with Client(cluster, asynchronous=True) as dask_client:
        yield dask_client
def start_cluster(nb_workers, walltime, out_dir, timeout=600):
    """
    Create a dask PBS cluster, scale it and connect a client to it.

    Each PBS job requests ``nb_cpus`` cpus, where ``nb_cpus`` is the number of
    workers per job times the multi-threading factor, and 5000 MB of memory
    per cpu.

    Threads number:
    start_cluster will use OMP_NUM_THREADS environment variable to determine
    how many threads might be used by a worker when running C/C++ code.
    (default to 1)

    Workers number:
    start_cluster will use CARS_NB_WORKERS_PER_PBS_JOB environment variable
    to determine how many workers should be started by a single PBS job.
    (default to 1)

    Queue worker:
    start_cluster will use CARS_PBS_QUEUE to determine in which queue worker
    jobs should be posted. (default: no queue given to PBSCluster)

    :param nb_workers: Number of dask workers
    :type nb_workers: int
    :param walltime: Walltime for each dask worker
    :type walltime: string
    :param out_dir: Output directory (dask logs go to ``<out_dir>/dask_log``)
    :type out_dir: string
    :param timeout: Timeout passed to the dask client constructor
    :type timeout: int
    :return: Dask cluster and dask client
    :rtype: (dask_jobqueue.PBSCluster, dask.distributed.Client) tuple
    """
    # Local import so this function does not depend on the file's import block
    import shlex

    # Retrieve multi-threading factor for C/C++ code if available
    omp_num_threads = int(os.environ.get('OMP_NUM_THREADS') or 1)

    # Retrieve number of workers per PBS job
    nb_workers_per_job = int(
        os.environ.get('CARS_NB_WORKERS_PER_PBS_JOB') or 1)

    # Retrieve PBS queue (unset or empty means no explicit queue)
    pbs_queue = os.environ.get('CARS_PBS_QUEUE') or None

    # Total number of cpus is multi-threading factor times size of batch
    # (number of workers per PBS job)
    nb_cpus = nb_workers_per_job * omp_num_threads

    # Cluster nodes have 5GB per core
    memory = nb_cpus * 5000

    # Resource string for PBS
    resource = "select=1:ncpus={}:mem={}mb".format(nb_cpus, memory)

    nb_jobs = int(math.ceil(nb_workers / nb_workers_per_job))

    logging.info(
        "Starting Dask PBS cluster with {} workers "
        "({} workers with {} cores each per PBS job)".format(
            nb_workers, nb_workers_per_job, omp_num_threads))

    logging.info(
        "Submitting {} PBS jobs "
        "with configuration cpu={}, mem={}, walltime={}".format(
            nb_jobs, nb_cpus, memory, walltime))

    # Environment variables forwarded to the worker jobs when they are set
    names = [
        'PATH', 'PYTHONPATH', 'CARS_STATIC_CONFIGURATION', 'LD_LIBRARY_PATH',
        'OTB_APPLICATION_PATH', 'OTB_GEOID_FILE', 'OMP_NUM_THREADS',
        'NUMBA_NUM_THREADS', 'OPJ_NUM_THREADS', 'GDAL_NUM_THREADS',
        'OTB_MAX_RAM_HINT', 'VIRTUAL_ENV',
        'ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS', 'GDAL_CACHEMAX']
    # Values are shell-quoted: these lines end up in the job script, where an
    # unquoted value containing spaces or shell metacharacters would break
    # the export. Values made only of safe characters are left unchanged.
    envs = [
        "export {}={}".format(name, shlex.quote(os.environ[name]))
        for name in names if os.environ.get(name)]

    log_directory = os.path.join(os.path.abspath(out_dir), "dask_log")
    # Literal '$TMPDIR': left unexpanded here and handed to PBSCluster as is
    local_directory = '$TMPDIR'

    cluster = PBSCluster(
        processes=nb_workers_per_job,
        cores=nb_workers_per_job,
        resource_spec=resource,
        memory="{}MB".format(memory),
        local_directory=local_directory,
        project='dask-test',
        walltime=walltime,
        interface='ib0',
        queue=pbs_queue,
        env_extra=envs,
        log_directory=log_directory)
    logging.info("Dask cluster started")

    cluster.scale(nb_workers)
    client = Client(cluster, timeout=timeout)
    logging.info(
        "Dashboard started at {}".format(get_dashboard_link(cluster)))
    return cluster, client
def client(self):
    """Return the scheduler's client, or a new default ``Client``.

    When ``self.scheduler`` is truthy its ``client`` attribute is returned;
    otherwise a ``Client()`` built with default arguments is returned.
    """
    if not self.scheduler:
        return Client()
    return self.scheduler.client
# import pop_tools # import xoak # import xesmf as xe # dask jupyter lab packages from dask.distributed import Client # from dask.distributed import performance_report # file name with time packages # from itertools import product # from cftime import DatetimeNoLeap # ====================== # === incorporate dask === client = Client("tcp://10.73.1.1:36170") client # ======================== # === define parameters === # --- t12 --- # FSm: Florida Strait Meridional section # 292 292 1418 1443 1 42 merid Florida Strait # reference: https://ncar.github.io/POP/doc/build/html/users_guide/model-diagnostics-and-output.html ilon1_FSm_t12 = 292-1 ilat1_FSm_t12, ilat2_FSm_t12 = 1418-1-1, 1443 # -1 is because python index starts from 0 while Fortran index starts from 1 # CESM-POP subroutine diag_transport line # 2010-2255 at https://github.com/ESCOMP/POP2-CESM/blob/master/source/diagnostics.F90 # compute averages along nlat dim, so I read one point ahead of 1418, the two sides are on the lands (KMU=0, see codes below) # -----------
def client():
    """Yield a local Dask client with two single-threaded workers.

    The client is created with ``n_workers=2, threads_per_worker=1`` and is
    closed by the ``with`` block once the consumer of this generator is done.
    """
    with Client(n_workers=2, threads_per_worker=1) as dask_client:
        # Yield the Client instance. The original yielded the name
        # ``client``, which in this scope is the generator function itself
        # because the ``with`` statement never bound the instance.
        yield dask_client
def main():
    """Run the baseline and reform simulations and print the differences.

    A threaded Dask client (``processes=False``) is created and handed to
    ``runner`` for both runs. The baseline run writes to ``BASELINE_DIR`` and
    the reform run to ``REFORM_DIR``; ``postprocess.create_diff`` then
    computes the percentage changes that are printed at the end. The client
    is closed when the function exits, including on error.
    """
    # Define parameters to use for multiprocessing
    client = Client(processes=False)
    try:
        num_workers = min(multiprocessing.cpu_count(), 7)
        print('Number of workers = ', num_workers)
        run_start_time = time.time()

        # Proposed changes:
        # 1) Increase government spending - increase by 2pp of GDP,
        #    starting in 2020 ending after 2022
        # 2) Cut corporate income tax rate to 25%, permanently
        # 3) Increase the standard deduction from 40,000 to 100,000,
        #    starting in 2020

        # Specify direct tax reform
        dt_reform = {2020: {'_std_deduction': [100000]}}

        # Set some model parameters
        # See parameters.py for description of these parameters
        OG_params = {
            # specify policy from 2019 through 2023 - assuming constant
            # thereafter
            'alpha_G': [0.112, 0.132, 0.132, 0.132, 0.112],
            # note the rate is the effective tax rate
            'tau_b': [0.27, 0.25 * 0.27 / 0.34],
            # don't estimate age specific tax functions
            'age_specific': False,
            # estimate linear marginal and effective tax rate functions
            'tax_func_type': 'linear',
        }

        # Arguments that are identical for the baseline and the reform run
        shared_kwargs = {
            'baseline_dir': BASELINE_DIR,
            'test': False,
            'time_path': True,
            'run_micro': True,
            'data': 'pitSmallData.csv',
            'client': client,
            'num_workers': num_workers,
        }

        # --------------------------------------------------------------
        # Run baseline policy first
        # --------------------------------------------------------------
        kwargs = dict(
            shared_kwargs,
            output_base=BASELINE_DIR,
            baseline=True,
            user_params={'age_specific': False, 'tax_func_type': 'linear'},
            guid='_TPRU_19232019',
        )
        start_time = time.time()
        runner(**kwargs)
        print('run time = ', time.time() - start_time)

        # --------------------------------------------------------------
        # Run reform policy
        # --------------------------------------------------------------
        kwargs = dict(
            shared_kwargs,
            output_base=REFORM_DIR,
            baseline=False,
            user_params=OG_params,
            guid='_TPRU_19232019_policy',
            reform=dt_reform,
        )
        start_time = time.time()
        runner(**kwargs)
        print('run time = ', time.time() - start_time)

        # ans - the percentage changes in macro aggregates and prices
        # due to policy changes from the baseline to the reform
        ans = postprocess.create_diff(baseline_dir=BASELINE_DIR,
                                      policy_dir=REFORM_DIR)

        print("total time was ", (time.time() - run_start_time))
        print('Percentage changes in aggregates:', ans)
    finally:
        # Release the Dask client even when one of the runs fails
        client.close()
def runCrossValidate(self, verbose=False):
    """Cross-validate every model in ``self.model_dict`` on a Dask backend.

    Each model is passed to ``cross_validate`` inside a joblib ``dask``
    parallel backend with ``self.X_df`` as features and the first column of
    ``self.y_df`` as target. The scheduler address comes from the
    ``DASK_SCHEDULER`` environment variable and falls back to port 8786 on
    this host's address. One client is opened lazily for the whole loop and
    closed at the end.

    When ``self.run_stacked`` is true, the fitted ``MultiPipe`` estimators
    are split into their individual pipelines, which are added to the
    results under their own names (suffixed with ``_fcombo`` when the name
    is already taken).

    The results are stored in ``self.cv_results``; nothing is returned.

    :param verbose: when true, log settings, scores, runtimes and results.
    """
    self.logger.log("Cross-validate started...",
                    self.step_n,
                    message="Running cross validation")
    # Only reported in the log below; the dask backend always runs with
    # n_jobs=-1.
    n_jobs = self.cv_n_jobs
    cv_results = {}
    new_cv_results = {}
    cv = self.getCV()
    if verbose:
        logger.info(
            f"RunCrossValidate - n_jobs: {n_jobs}, scorer_list: {self.scorer_list}"
        )

    client = None
    try:
        for pipe_name, model in self.model_dict.items():
            if verbose:
                logger.info(
                    f"RunCrossValidate - Running CV on pipe_name: {pipe_name}")
            start = time.time()
            if client is None:
                # Connect once and reuse the client for every model. The
                # hostname lookup only happens when DASK_SCHEDULER is unset.
                dask_scheduler = os.getenv("DASK_SCHEDULER")
                if dask_scheduler is None:
                    dask_scheduler = (
                        "tcp://" +
                        socket.gethostbyname(socket.gethostname()) + ":8786")
                client = Client(dask_scheduler)
            with parallel_backend('dask', n_jobs=-1):
                # 40min test case
                model_i = cross_validate(model,
                                         self.X_df,
                                         self.y_df.iloc[:, 0],
                                         return_estimator=True,
                                         scoring=self.scorer_list,
                                         cv=cv,
                                         n_jobs=1,
                                         verbose=3)
            end = time.time()
            if verbose:
                scores = [(scorer, np.mean(model_i[f'test_{scorer}']))
                          for scorer in self.scorer_list]
                logger.info(
                    f"SCORES - {pipe_name},{scores}, runtime: {(end-start)/60} min."
                )
                logger.info(f"MODELS - {pipe_name},{model_i}")
            cv_results[pipe_name] = model_i
    finally:
        if client is not None:
            client.close()

    if self.run_stacked:
        for est_name, result in cv_results.items():
            if type(result['estimator'][0]) is MultiPipe:
                # Collect, per individual pipeline name, the fitted pipeline
                # from every estimator returned for this model.
                new_results = {}
                for mp in result['estimator']:
                    fitted = mp.build_individual_fitted_pipelines()
                    for est_n, m in fitted.items():
                        new_results.setdefault(est_n, []).append(m)
                for est_n, estimators in new_results.items():
                    # Rename on collision with a separate key: the original
                    # mutated est_n and then indexed new_results with the
                    # renamed key, which is not present there.
                    key = est_n + '_fcombo' if est_n in cv_results else est_n
                    new_cv_results[key] = {'estimator': estimators}
        cv_results = {**new_cv_results, **cv_results}

    if verbose:
        logger.info("CV Results: {}".format(cv_results))
    self.cv_results = cv_results
    self.logger.log("Cross-validate complete.",
                    self.step_n,
                    message="Completed cross validation")
def ucx_client(ucx_cluster):
    """Yield a Dask client connected to ``ucx_cluster`` and close it afterwards.

    :param ucx_cluster: the cluster object the client connects to.
    """
    # Use the parameter: the original referenced the name ``cluster``, which
    # this function does not define.
    client = Client(ucx_cluster)
    try:
        yield client
    finally:
        # Close the client even if the generator is closed early or an
        # exception is thrown into it.
        client.close()
# Flow parameters: the URL the image archive is downloaded from and the file
# name it is saved under.
DATA_URL = Parameter(
    "DATA_URL",
    default=
    "https://github.com/cicdw/image-data/blob/master/all-images.img?raw=true"
)
DATA_FILE = Parameter("DATA_FILE", default="image-data.img")

with Flow("Image ETL") as flow:
    # Extract
    command = curl_cmd(DATA_URL, DATA_FILE)
    curl = download(command=command)

    # Transform
    # we use the `upstream_tasks` keyword to specify non-data dependencies
    images = load_and_split(fname=DATA_FILE, upstream_tasks=[curl])

    # Load
    # one write task per image, then a single task combining the frames
    frames = write_to_disk.map(images)
    result = combine_to_gif(frames)

flow.visualize()

# start our Dask cluster
# NOTE(review): this client is never closed in the visible code.
client = Client(n_workers=4, threads_per_worker=1)

# point Prefect's DaskExecutor to our Dask cluster
executor = DaskExecutor(address=client.scheduler.address)
flow.run(executor=executor)
data_index = ['Seq Record ID', 'Resistance', 'Name', 'Genus', 'DNA Type', 'Strain', 'Bacteria Type', 'Notes'] # combining the the non-numerical and categorical labels for the pandas dataframe data_index.extend(data_categories) # Number of samples per species num_training_samples = 1000 # List of error testing rates error_rate = [0] # List of number of optical sequencing reads num_reads = [1000000] # Getting the list of all the data files that need to be tested file_list = [file for file in os.listdir(local_SERS)] # Intializing dask to run things in parallel with LocalCluster(processes=False) as cluster, Client(cluster) as client: # Cycling through the 24 combinations of error rate and number of reads for error in error_rate: # Recording the time it takes to run everything for reads in num_reads: # Getting the error and reads values in string form for file identification str_err = '_%s_' % (int(100*error)) str_read = '_%s_' % (reads) # Getting the list of the files for the specific reads and errors working_genome_list = [file for file in file_list if str_err in file and str_read in file and 'Genome' in file] # Making sure that the lists aren't empty run_list = [] if len(working_genome_list) != 0: run_list.append(working_genome_list)
def client(cluster):
    """Yield a Dask client connected to ``cluster`` and close it afterwards.

    :param cluster: the cluster object the client connects to.
    """
    dask_client = Client(cluster)
    try:
        yield dask_client
    finally:
        # Close the client even if the generator is closed early or an
        # exception is thrown into it at the yield point.
        dask_client.close()
def test_mstumped(T, m, dask_cluster):
    """Check ``stumpy.mstumped(..., normalize=False)`` against ``stumpy.maamped``.

    Both functions are run on the same client, time series ``T`` and window
    size ``m``; their outputs must be almost equal.
    """
    with Client(dask_cluster) as distributed_client:
        expected = stumpy.maamped(distributed_client, T, m)
        actual = stumpy.mstumped(distributed_client, T, m, normalize=False)
        npt.assert_almost_equal(expected, actual)
async def test_cluster_create(cluster):
    """Scale the cluster to one worker and run a trivial task on it."""
    cluster.scale(1)
    await cluster
    async with Client(cluster, asynchronous=True) as dask_client:
        future = dask_client.submit(lambda x: x + 1, 10)
        outcome = await future
        assert outcome == 11
async def test_start_with_workers(k8s_cluster, pod_spec):
    """Start a ``KubeCluster`` with two workers and wait until both register.

    The scheduler info is polled every 0.1 s until it lists exactly two
    workers; there is no upper bound on the wait.
    """
    async with KubeCluster(pod_spec, n_workers=2,
                           **cluster_kwargs) as cluster, Client(
                               cluster, asynchronous=True) as client:
        while True:
            if len(cluster.scheduler_info["workers"]) == 2:
                break
            await asyncio.sleep(0.1)
import imageio
import numpy as np
from utoolbox.io.dataset import BigDataViewerDataset, LatticeScopeTiledDataset

if __name__ == "__main__":
    # Only show errors from tifffile; everything else logs at DEBUG level.
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG", fmt="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S")

    logger = logging.getLogger(__name__)

    # Hard-coded switch: the local cluster branch is always taken; the else
    # branch (a fixed scheduler address) is unreachable as written.
    if True:
        cluster = LocalCluster(n_workers=4, threads_per_worker=4)
        client = Client(cluster)
    else:
        client = Client("10.109.20.6:8786")
    logger.info(client)

    # NOTE(review): the dataset is constructed with None - presumably a
    # placeholder for the dataset location; confirm before running.
    src_ds = LatticeScopeTiledDataset(None)
    print(src_ds.inventory)

    logger.info(f"tile by {src_ds.tile_shape}")

    # import ipdb; ipdb.set_trace()

    # Unique values of the "tile_z" index level, and the middle one of them.
    z = src_ds.index.get_level_values("tile_z").unique().values
    mid_z = z[len(z) // 2]
    logger.info(f"mid z plane @ {mid_z}")
def _expand_sector_variants(custom_input):
    """Return ``custom_input`` followed by its single-sector variants.

    For a ``(1, 5)`` array of RES/IND/TRA/AGR/ENE scalings the result is, in
    order: the input itself, five copies where every sector except one is
    set to 1.0, five copies where one sector is set to 0.0, and five copies
    where every sector except one is set to 0.0.
    """
    n_sectors = custom_input.shape[1]
    isolated, removed, alone = [], [], []
    for sector in range(n_sectors):
        others = [i for i in range(n_sectors) if i != sector]

        only_this_scaled = np.copy(custom_input)
        only_this_scaled[0, others] = 1.0
        isolated.append(only_this_scaled)

        this_removed = np.copy(custom_input)
        this_removed[0, sector] = 0.0
        removed.append(this_removed)

        only_this_kept = np.copy(custom_input)
        only_this_kept[0, others] = 0.0
        alone.append(only_this_kept)
    return [custom_input] + isolated + removed + alone


def _custom_input_labels(custom_inputs_main):
    """Return the 3-decimal labels of every input and its sector variants."""
    labels = []
    for custom_input_main in custom_inputs_main:
        for custom_input in _expand_sector_variants(custom_input_main):
            labels.append(
                f'RES{custom_input[0][0]:0.3f}_IND{custom_input[0][1]:0.3f}_TRA{custom_input[0][2]:0.3f}_AGR{custom_input[0][3]:0.3f}_ENE{custom_input[0][4]:0.3f}'
            )
    return labels


def _requested_emission_configs():
    """Build the emission-config labels selected by the module-level flags.

    The flags ``normal``, ``extra``, ``climate_cobenefits`` and
    ``top_down_2020_baseline`` are checked in that order; when several are
    set the last one wins. With no flag set the result is empty.
    """
    labels = []

    if normal:
        # every combination of 0.0 to 1.4 in steps of 0.2 for the 5 sectors
        axes = [np.linspace(0.0, 1.4, 8) for _ in range(5)]
        emission_configs = np.array(np.meshgrid(*axes)).T.reshape(-1, 5)
        labels = [
            f'RES{round(emission_config[0], 1)}_IND{round(emission_config[1], 1)}_TRA{round(emission_config[2], 1)}_AGR{round(emission_config[3], 1)}_ENE{round(emission_config[4], 1)}'
            for emission_config in emission_configs
        ]

    if extra:
        labels = _custom_input_labels([
            np.array([[1.15, 1.27, 0.98, 0.98, 1.36]]),  # bottom-up 2010
            np.array([[1.19, 1.30, 1.01, 1.01, 1.46]]),  # bottom-up 2011
            np.array([[1.20, 1.30, 1.01, 1.02, 1.39]]),  # bottom-up 2012
            np.array([[1.13, 1.29, 1.02, 1.01, 1.29]]),  # bottom-up 2013
            np.array([[1.06, 1.12, 0.99, 1.01, 1.12]]),  # bottom-up 2014
            np.array([[0.92, 0.84, 0.97, 0.99, 0.94]]),  # bottom-up 2016
            np.array([[0.84, 0.81, 0.99, 0.99, 0.89]]),  # bottom-up 2017
            np.array([[0.76, 0.934, 0.735, 0.683, 0.708]]),
            np.array([[0.704, 0.786, 0.73, 0.659, 0.6]]),
            np.array([[0.712, 0.703, 0.725, 0.676, 0.649]]),
            np.array([[0.739, 0.668, 0.701, 0.686, 0.682]]),
            np.array([[0.67, 0.609, 0.709, 0.621, 0.661]]),
            np.array([[0.744, 0.904, 0.778, 0.678, 0.716]]),
            np.array([[0.771, 0.835, 0.711, 0.685, 0.544]]),
            np.array([[0.647, 0.945, 0.746, 0.588, 0.473]]),
            np.array([[0.657, 0.745, 0.714, 0.613, 0.591]]),
            np.array([[0.582, 0.7, 0.672, 0.5, 0.492]]),
            np.array([[0.803, 0.835, 0.742, 0.71, 0.717]]),
            np.array([[0.721, 0.863, 0.712, 0.74, 0.709]]),
            np.array([[0.661, 0.674, 0.694, 0.742, 0.715]]),
            np.array([[0.701, 0.642, 0.669, 0.681, 0.679]]),
            np.array([[0.604, 0.399, 0.659, 0.613, 0.724]]),
            np.array([[0.769, 1.009, 0.697, 0.69, 0.72]]),
            np.array([[0.824, 0.759, 0.767, 0.641, 0.429]]),
            np.array([[0.858, 1.092, 0.794, 0.604, 0.475]]),
            np.array([[0.8, 0.987, 0.648, 0.57, 0.493]]),
            np.array([[0.867, 0.957, 0.677, 0.558, 0.477]]),
        ])

    if climate_cobenefits:
        labels = _custom_input_labels([
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_CLE_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # Base_MFR_2020
            np.array([[0.91, 0.95, 0.85, 1.05, 0.96]]),  # SDS_MFR_2020
            np.array([[0.68, 0.84, 0.71, 1.16, 0.93]]),  # Base_CLE_2030
            np.array([[0.33, 0.47, 0.48, 0.81, 0.69]]),  # Base_MFR_2030
            np.array([[0.27, 0.45, 0.41, 0.81, 0.55]]),  # SDS_MFR_2030
            np.array([[0.57, 0.75, 0.69, 1.2, 0.94]]),  # Base_CLE_2040
            np.array([[0.24, 0.41, 0.31, 0.83, 0.73]]),  # Base_MFR_2040
            np.array([[0.19, 0.38, 0.22, 0.83, 0.5]]),  # SDS_MFR_2040
            np.array([[0.52, 0.72, 0.65, 1.24, 0.91]]),  # Base_CLE_2050
            np.array([[0.2, 0.38, 0.29, 0.86, 0.72]]),  # Base_MFR_2050
            np.array([[0.18, 0.35, 0.2, 0.86, 0.46]]),  # SDS_MFR_2050
        ])

    if top_down_2020_baseline:
        # matching to PM2.5 only, top 1,000
        emission_config_2020_baseline = np.array(
            [0.604, 0.399, 0.659, 0.613, 0.724])
        # 10% reduction increments from 2020 baseline up to 50%
        axes = [
            np.linspace(baseline * 0.50, baseline, 6)
            for baseline in emission_config_2020_baseline
        ]
        emission_configs = list(
            np.array(np.meshgrid(*axes)).T.reshape(-1, 5))
        # add a couple more for larger reductions in RES and IND to reach
        # WHO-IT2
        emission_configs.append(np.array([0.242, 0.160, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.181, 0.120, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.121, 0.080, 0.659, 0.613, 0.724]))
        emission_configs.append(np.array([0.060, 0.040, 0.659, 0.613, 0.724]))
        labels = [
            f'RES{round(emission_config[0], 3):.3f}_IND{round(emission_config[1], 3):.3f}_TRA{round(emission_config[2], 3):.3f}_AGR{round(emission_config[3], 3):.3f}_ENE{round(emission_config[4], 3):.3f}'
            for emission_config in emission_configs
        ]

    return labels


def _completed_emission_configs():
    """Return the config labels that already have an adjusted output file."""
    directory = f"/nobackup/earlacoa/machinelearning/data_annual/predictions/{output}_adjusted/"
    suffix = f"_{output}_popgrid_0.25deg_adjusted.nc"
    paths = glob.glob(f"{directory}ds*{output}_popgrid_0.25deg_adjusted.nc")
    # Strip the directory plus the 3-character file prefix ("ds" and one
    # separator) and the suffix. For a 9-character ``output`` this is the
    # [81:-38] slice that used to be hard-coded; deriving it from the pattern
    # keeps it correct for other lengths.
    start = len(directory) + len("ds_")
    return [path[start:-len(suffix)] for path in paths]


def main():
    """Run ``adjust`` on every emission configuration not yet processed.

    Starts an SGE Dask cluster with 35 single-process jobs, builds the
    configuration labels selected by the module-level scenario flags, drops
    those whose adjusted output file already exists for ``output``, and maps
    ``adjust`` over at most 15000 of the rest through a Dask bag. Timing is
    printed at the end. The client and the cluster are closed on exit,
    including on error.
    """
    # dask cluster and client
    n_processes = 1
    n_jobs = 35
    n_workers = n_processes * n_jobs

    cluster = SGECluster(
        interface="ib0",
        walltime="01:00:00",
        memory="64 G",
        resource_spec="h_vmem=64G",
        scheduler_options={
            "dashboard_address": ":5757",
        },
        job_extra=[
            "-cwd",
            "-V",
            f"-pe smp {n_processes}",
            "-l disk=32G",
        ],
        # fall back to the working directory when PWD is not exported
        local_directory=os.sep.join(
            [os.environ.get("PWD", os.getcwd()), "dask-worker-scale-space"]),
    )
    try:
        client = Client(cluster)
        try:
            cluster.scale(jobs=n_jobs)

            time_start = time.time()

            # scale custom outputs
            emission_configs_20percentintervals = _requested_emission_configs()
            emission_configs_completed = _completed_emission_configs()
            emission_configs_20percentintervals_remaining_set = set(
                emission_configs_20percentintervals) - set(
                    emission_configs_completed)
            emission_configs_remaining = list(
                emission_configs_20percentintervals_remaining_set)

            # avoid dividing by zero when no configuration was requested
            percent_remaining = 0
            if emission_configs_20percentintervals:
                percent_remaining = int(
                    100 *
                    len(emission_configs_20percentintervals_remaining_set) /
                    len(emission_configs_20percentintervals))
            print(
                f"custom outputs remaining for {output}: {len(emission_configs_remaining)} - 20% intervals with {percent_remaining}% remaining"
            )

            # dask bag and process
            emission_configs_remaining = emission_configs_remaining[:15000]
            print(
                f"predicting for {len(emission_configs_remaining)} custom outputs ..."
            )
            if emission_configs_remaining:
                bag_emission_configs = db.from_sequence(
                    emission_configs_remaining, npartitions=n_workers)
                bag_emission_configs.map(adjust).compute()

            time_end = time.time() - time_start
            print(
                f"completed in {time_end:0.2f} seconds, or {time_end / 60:0.2f} minutes, or {time_end / 3600:0.2f} hours"
            )
            # the average is undefined when nothing was left to process
            if emission_configs_remaining:
                print(
                    f"average time per custom output is {time_end / len(emission_configs_remaining):0.2f} seconds"
                )
        finally:
            client.close()
    finally:
        cluster.close()
#DATA = DATA_PARQUET_10_SNAPPY # note 250MB partitions #columns = ['date', 'pt', 'new'] # more efficient process #client = Client(n_workers=32, threads_per_worker=1, processes=True, memory_limit='3GB') # succeeds in 3s #DATA = DATA_PARQUET_10_SNAPPY # note 250MB partitions #columns = ['date', 'pt', 'new'] # more efficient process #client = Client(n_workers=10, threads_per_worker=1, processes=True, memory_limit='3GB') # succeeds in 3s if __name__ == "__main__": if 'client' not in dir(): #client = Client(n_workers=32, threads_per_worker=1, processes=True, memory_limit='3GB') client = Client(n_workers=10, threads_per_worker=1, processes=True, memory_limit='3GB') #client = Client(n_workers=2, threads_per_worker=1, processes=True, memory_limit='7GB') print(client) columns = ['date', 'pt', 'new'] # more efficient process #columns = None # used in conf talk DATA = DATA_PARQUET_10_SNAPPY #DATA = DATA_PARQUET_10_NOCOMPRESSION #DATA = DATA_PARQUET_100_SNAPPY ddf = dd.read_parquet(DATA, columns=columns) #ddf.head() #%%time t1 = time.time()
def index_f_and_f(dump_pk, user_pk):
    """
    Submit ``unzip_then_run`` for a new index to dask and do not wait for it.

    A client is connected to ``settings.DASK_SCHEDULER_URL``, the task is
    submitted with ``dump_pk`` and ``user_pk``, and the resulting future is
    handed to ``fire_and_forget``.
    """
    scheduler_client = Client(settings.DASK_SCHEDULER_URL)
    pending = scheduler_client.submit(unzip_then_run, dump_pk, user_pk)
    fire_and_forget(pending)
def setup_workers(numCore):
    """Create a cluster, connect a client and start ``numCore`` workers.

    The cluster comes from ``setup_cluster()``; the client is connected
    before ``cluster.start_workers(numCore)`` is called.

    Returns the ``(cluster, client)`` pair.
    """
    dask_cluster = setup_cluster()
    from dask.distributed import Client
    dask_client = Client(dask_cluster)
    dask_cluster.start_workers(numCore)
    return dask_cluster, dask_client
conn.execute('''DROP INDEX IF EXISTS idx_movie2;''') conn.execute('''CREATE INDEX idx_movie1 ON moviePairs(movie1);''') conn.execute('''CREATE INDEX idx_movie2 ON moviePairs(movie2);''') conn.commit() for i in mps.itertuples(name=None): #print(i) conn.execute( '''INSERT INTO moviePairs(movie1,movie2,numPairs,score) VALUES(?,?,?,?);''', (i[0][0], i[0][1], i[2], i[1])) conn.commit() #Main if __name__ == "__main__": client = Client(processes=False) print(client) fileRatings = "ml-1m/ratings.dat" fileNames = "ml-1m/movies.dat" moviePairSimilarities = computeMoviePairSimilarities(fileRatings) conn = sqlite3.connect('movieNamesDask.db') #moviePairsToSQL(conn,moviePairSimilarities) conn.close() #moviePairSimilarities.to_cvs("similarities.json") movieNames = pd.read_csv(fileNames, names=["movieID", "title"],
y_train, validation_data=(x_valid, y_valid), shuffle=True, batch_size=BATCHSIZE, epochs=EPOCHS, verbose=False, ) # Evaluate the model accuracy on the validation set. score = model.evaluate(x_valid, y_valid, verbose=0) return score[1] if __name__ == "__main__": with Client() as client: print(f"Dask dashboard is available at {client.dashboard_link}") storage = dask_optuna.DaskStorage() study = optuna.create_study(storage=storage, direction="maximize") with joblib.parallel_backend("dask"): study.optimize(objective, n_trials=20, n_jobs=-1) print("Number of trials: {}".format(len(study.trials))) print("Best trial:") trial = study.best_trial print(" Value: {}".format(trial.value)) print(" Params: ")
import dask.array
from sympy import latex
from pathlib import Path
from dask.distributed import Client
from dask.distributed import LocalCluster
import bgc_md2.models.cable_all.cableCache as cC
import bgc_md2.models.cable_all.cablePaths as cP
import bgc_md2.models.cable_all.cableHelpers as cH

if __name__ == "__main__":
    # Create a local cluster only when the name is not already defined,
    # presumably so re-running in an interactive session reuses it.
    if "cluster" not in dir():
        # cluster = LocalCluster(n_workers = 1)
        cluster = LocalCluster()

    client = Client(cluster)

    # +
    # Optional helper: print the commands for reaching the cluster through
    # local port 8880, if the helper module is installed.
    try:
        from ports.server_helpers import print_commands

        print_commands(cluster, local_port=8880)
    except ImportError as e:
        pass  # module doesn't exist, don't make a fuss
    # -

    # choose the cable output directory you want to work with
    cable_out_path = Path(
        "/home/data/cable-data/example_runs/parallel_1901_2004_with_spinup/output/new4"
    )
import os import shutil import time import schemas from dask.distributed import Client import dask.dataframe as dd TABLE_NAME = os.environ['TABLE_NAME'] SOURCE = f'/data/raw/{TABLE_NAME}' DESTINATION = f'/data/default/{TABLE_NAME}' if __name__ == '__main__': # start the client client = Client() # find the files that exist fnames = [f'{SOURCE}/{fname}' for fname in os.listdir(SOURCE)] # read in existing data df = dd.read_json(fnames, dtype=getattr(schemas, TABLE_NAME)) # add a column for partitions df['hour'] = df['timestamp'].dt.floor('h') # write out files start_time = time.time() df.to_parquet(DESTINATION, engine='fastparquet', append=True, compression='gzip', partition_on=['pair', 'hour'],
def full_pipeline_dask(job_name, train_function, eval_function, infer_function, **kwargs):
    """Train, fine-tune, infer and evaluate on SLURM clusters through dask.

    A first cluster trains one model per acceleration factor (4 and 8), then
    fine-tunes each of them on every contrast. A second cluster then runs
    inference and evaluation for each (acceleration factor, contrast) pair
    and the evaluation results are printed. Both clusters and their clients
    are closed when done, including on error.

    When the ``FASTMRI_DEBUG`` environment variable is set, all epoch and
    sample counts are reduced to 1.

    :param job_name: SLURM job name, also passed to ``infer_function`` as
        ``exp_id``.
    :param train_function: called with ``af``, ``n_epochs`` and ``kwargs``
        for training, plus ``contrast`` and ``original_run_id`` for
        fine-tuning; its result is used as a run id.
    :param eval_function: called per pair; its result is unpacked as
        ``(metrics_names, eval_res)``.
    :param infer_function: called per pair to run inference.
    :param kwargs: forwarded to all three functions, except that ``loss`` is
        removed before inference and evaluation.
    """
    if os.environ.get('FASTMRI_DEBUG'):
        n_epochs_train = 1
        n_epochs_fine_tune = 1
        n_eval_samples = 1
        n_inference_samples = 1
    else:
        n_epochs_train = 250
        n_epochs_fine_tune = 50
        n_eval_samples = 50
        n_inference_samples = None

    acceleration_factors = [4, 8]
    contrasts = ['CORPDFS_FBK', 'CORPD_FBK']
    # Options shared by the training and the inference/eval jobs
    shared_job_extra = [
        '--distribution=block:block',
        '--hint=nomultithread',
        '--output=%x_%j.out',
    ]
    env_extra = [
        'cd $WORK/fastmri-reproducible-benchmark',
        '. ./submission_scripts_jean_zay/env_config.sh',
    ]

    # NOTE(review): every submit below passes pure=True although the
    # submitted functions have potential side effects; dask documents
    # pure=False for impure functions. Left unchanged - confirm the intent.

    # original training
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='80:00:00',
        interface='ib0',
        job_extra=['--gres=gpu:1', '--qos=qos_gpu-t4'] + shared_job_extra,
        env_extra=env_extra,
    )
    try:
        train_cluster.scale(2)
        client = Client(train_cluster)
        try:
            futures = [
                client.submit(
                    # function to execute
                    train_function,
                    af=af,
                    n_epochs=n_epochs_train,
                    **kwargs,
                    pure=True,
                ) for af in acceleration_factors
            ]
            run_ids = client.gather(futures)

            # fine tuning
            train_cluster.scale(4)
            futures = [
                client.submit(
                    # function to execute
                    train_function,
                    af=af,
                    contrast=contrast,
                    original_run_id=run_id,
                    n_epochs=n_epochs_fine_tune,
                    **kwargs,
                    pure=True,
                )
                for af, run_id in zip(acceleration_factors, run_ids)
                for contrast in contrasts
            ]
            fine_tuned_run_ids = client.gather(futures)
        finally:
            client.close()
    finally:
        train_cluster.close()

    # inference and eval
    inference_eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=['--gres=gpu:4', '--qos=qos_gpu-t3'] + shared_job_extra,
        env_extra=env_extra,
    )
    try:
        inference_eval_cluster.scale(8)
        client = Client(inference_eval_cluster)
        try:
            # 'loss' only applies to training. Tolerate its absence instead
            # of raising KeyError after the training has already completed.
            kwargs.pop('loss', None)

            # Same (af, contrast) order as the fine-tuning submissions, so
            # the pairs line up with fine_tuned_run_ids.
            configs = [(af, contrast)
                       for af in acceleration_factors
                       for contrast in contrasts]
            inference_futures = []
            eval_futures = []
            for (af, contrast), run_id in zip(configs, fine_tuned_run_ids):
                inference_futures.append(
                    client.submit(
                        # function to execute
                        infer_function,
                        contrast=contrast,
                        af=af,
                        run_id=run_id,
                        n_epochs=n_epochs_fine_tune,
                        n_samples=n_inference_samples,
                        exp_id=job_name,
                        **kwargs,
                        pure=True,
                    ))
                eval_futures.append(
                    client.submit(
                        # function to execute
                        eval_function,
                        contrast=contrast,
                        af=af,
                        run_id=run_id,
                        n_epochs=n_epochs_fine_tune,
                        n_samples=n_eval_samples,
                        **kwargs,
                        pure=True,
                    ))
            client.gather(inference_futures)

            # eval printing
            for (af, contrast), eval_future in zip(configs, eval_futures):
                metrics_names, eval_res = client.gather(eval_future)
                print('AF', af)
                print('Contrast', contrast)
                print(metrics_names)
                print(eval_res)
            print('Shutting down dask workers')
        finally:
            client.close()
    finally:
        inference_eval_cluster.close()
def run_micro_macro(user_params):
    """Run the baseline and the reform simulations and print the differences.

    The baseline run writes to ``BASELINE_DIR`` and the reform run to
    ``REFORM_DIR``; ``postprocess.create_diff`` then compares both
    directories. Run times and the resulting differences are printed.

    :param user_params: ignored. The parameter sets of both runs are
        hard-coded in the body; the argument is kept so existing callers
        keep working.
    :return: None
    """
    # Grab a reform JSON file already in Tax-Calculator
    # In this example the 'reform' is a change to 2017 law (the
    # baseline policy is tax law in 2018)
    reform_url = ('https://raw.githubusercontent.com/'
                  'PSLmodels/Tax-Calculator/master/taxcalc/'
                  'reforms/2017_law.json')
    # ref = Calculator.read_json_param_objects(reform_url, None)  # Modified
    # reform = ref['policy']  # Modified
    # NOTE(review): with the two lines above disabled, ``reform`` (used by the
    # reform run below) is not defined in this function. Presumably a
    # module-level ``reform`` exists; confirm, otherwise the reform run raises
    # NameError.

    # Define parameters to use for multiprocessing
    client = Client(processes=False)
    # try/finally so the client is released even when a run fails.
    try:
        num_workers = 1  # multiprocessing.cpu_count()
        print('Number of workers = ', num_workers)
        # Kept separate from the per-run timers so that the final
        # "total time" really covers both runs and the post-processing.
        total_start_time = time.time()

        # Set some model parameters
        # See parameters.py for description of these parameters
        alpha_T = np.ones(50) * 0.1230058215  # Modified
        alpha_G = np.ones(7) * 0.01234569933  # Modified
        small_open = False

        '''
        ------------------------------------------------------------------------
        Run baseline policy first
        ------------------------------------------------------------------------
        '''
        baseline_params = {
            'frisch': 0.5,
            'start_year': 2018,
            'tau_b': [(0.21 * 0.55) * (0.017 / 0.055),
                      (0.21 * 0.55) * (0.017 / 0.055)],
            'debt_ratio_ss': 2.0,
            'alpha_T': alpha_T.tolist(),
            'alpha_G': alpha_G.tolist(),
            'small_open': small_open,
        }  # modified
        baseline_kwargs = {
            'output_base': BASELINE_DIR,
            'baseline_dir': BASELINE_DIR,
            'test': False,
            'time_path': False,
            'baseline': True,
            'user_params': baseline_params,
            'guid': '_example',
            'run_micro': False,
            'data': 'cps',
            'client': client,
            'num_workers': num_workers,
        }
        run_start_time = time.time()
        runner(**baseline_kwargs)
        print('run time = ', time.time() - run_start_time)

        '''
        ------------------------------------------------------------------------
        Run reform policy
        ------------------------------------------------------------------------
        '''
        reform_params = {
            'frisch': 0.5,
            'start_year': 2018,
            'tau_b': [(0.35 * 0.55) * (0.017 / 0.055)],
            'debt_ratio_ss': 1.0,
            'alpha_T': alpha_T.tolist(),
            'alpha_G': alpha_G.tolist(),
            'small_open': small_open,
        }
        reform_kwargs = {
            'output_base': REFORM_DIR,
            'baseline_dir': BASELINE_DIR,
            'test': False,
            'time_path': True,
            'baseline': False,
            'user_params': reform_params,
            'guid': '_example',
            'reform': reform,
            'run_micro': True,
            'data': 'cps',
            'client': client,
            'num_workers': num_workers,
        }
        run_start_time = time.time()
        runner(**reform_kwargs)
        print('run time = ', time.time() - run_start_time)

        # return ans - the percentage changes in macro aggregates and prices
        # due to policy changes from the baseline to the reform
        ans = postprocess.create_diff(baseline_dir=BASELINE_DIR,
                                      policy_dir=REFORM_DIR)

        print("total time was ", (time.time() - total_start_time))
        print('Percentage changes in aggregates:', ans)
    finally:
        client.close()
def train_eval_parameter_grid(job_name, train_function, eval_function, parameter_grid):
    """Train and evaluate one model per configuration of a parameter grid.

    ``parameter_grid`` is expanded with ``ParameterGrid``. One
    ``train_function`` run per configuration is submitted to a first SLURM
    cluster; one ``eval_function`` run per configuration is then submitted
    to a second cluster and its result is printed.

    :param job_name: name given to the SLURM jobs.
    :param train_function: callable submitted with the configuration as
        keyword arguments; its return value is used as the run id.
    :param eval_function: callable submitted with ``run_id``,
        ``n_samples=50`` and the configuration (without its own
        ``n_samples`` entry, if any). Its result is unpacked as
        ``(metrics_names, eval_res)``.
    :param parameter_grid: grid specification accepted by ``ParameterGrid``.
    :return: None
    """
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)

    # training
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    # try/finally so that the client and the SLURM jobs are released even
    # when training fails.
    try:
        train_cluster.scale(n_parameters_config)
        client = Client(train_cluster)
        try:
            futures = [
                client.submit(
                    # function to execute
                    train_function,
                    **params,
                )
                for params in parameters
            ]
            run_ids = client.gather(futures)
        finally:
            client.close()
    finally:
        train_cluster.close()

    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    # Evaluation always uses n_samples=50, so the training value is dropped.
    # Filtering (instead of an unconditional pop) avoids a KeyError after
    # training when the grid does not define ``n_samples``.
    eval_parameters = [
        {key: value for key, value in params.items() if key != 'n_samples'}
        for params in parameters
    ]
    try:
        eval_cluster.scale(n_parameters_config)
        client = Client(eval_cluster)
        try:
            futures = [
                client.submit(
                    # function to execute
                    eval_function,
                    run_id=run_id,
                    n_samples=50,
                    **params,
                )
                for run_id, params in zip(run_ids, eval_parameters)
            ]
            for params, future in zip(eval_parameters, futures):
                metrics_names, eval_res = client.gather(future)
                print('Parameters', params)
                print(metrics_names)
                print(eval_res)
            print('Shutting down dask workers')
        finally:
            client.close()
    finally:
        eval_cluster.close()
def init(self):
    """Set up helper writers, output directories, options and the dask client.

    Reads the ``jupyter_*`` entries of ``self.config``. A dask ``Client`` is
    created only when notebook execution is enabled for the website
    (``jupyter_execute_notebooks``) or for the downloads
    (``jupyter_download_nb_execute``); otherwise ``self.client`` stays
    ``None``.
    """
    # Helper classes this builder delegates to.
    self._execute_notebook_class = ExecuteNotebookWriter(self)
    self._make_site_class = MakeSiteWriter(self)

    # Output directory layout, all relative to the build output directory.
    self.executedir = self.outdir + '/executed'
    self.reportdir = self.outdir + '/reports/'
    self.errordir = self.outdir + "/reports/{}"
    self.downloadsdir = self.outdir + "/_downloads"
    self.downloadsExecutedir = self.downloadsdir + "/executed"
    self.client = None

    # The default language must be one of the configured jupyter kernels;
    # otherwise warn and fall back to python3.
    default_language = self.config["jupyter_default_lang"]
    if default_language not in self.config["jupyter_kernels"]:
        self.logger.warning(
            "Default language defined in conf.py ({}) is not "
            "defined in the jupyter_kernels in conf.py. "
            "Set default language to python3"
            .format(default_language))
        self.config["jupyter_default_lang"] = "python3"

    # Apply anything the user has overridden on the command line
    # (comma-separated list of instructions).
    overrides = self.config['jupyter_options']
    instructions = overrides.split(",") if overrides else []
    for instruction in instructions:
        if not instruction:
            continue
        if instruction == 'code_only':
            self.config["jupyter_conversion_mode"] = "code"
        else:
            # Unrecognised instructions are reported and ignored.
            self.logger.warning("Unrecognise command line parameter " + instruction + ", ignoring.")

    # Threads per worker for dask distributed processing.
    if "jupyter_threads_per_worker" in self.config:
        self.threads_per_worker = self.config["jupyter_threads_per_worker"]

    # Number of workers for dask distributed processing.
    if "jupyter_number_workers" in self.config:
        self.n_workers = self.config["jupyter_number_workers"]

    def start_client():
        # A dask client is used to process the notebooks efficiently.
        # processes=False is sometimes preferable to avoid inter-worker
        # communication when the computations release the GIL (common when
        # primarily using NumPy or Dask Array).
        return Client(processes=False,
                      threads_per_worker=self.threads_per_worker,
                      n_workers=self.n_workers)

    def fresh_execution_vars(target, destination):
        # Book-keeping structure for one execution target.
        return {
            'target': target,
            'dependency_lists': self.config["jupyter_dependency_lists"],
            'executed_notebooks': [],
            'delayed_notebooks': dict(),
            'futures': [],
            'delayed_futures': [],
            'destination': destination,
        }

    if self.config["jupyter_execute_notebooks"]:
        self.client = start_client()
        self.execution_vars = fresh_execution_vars('website', self.executedir)

    if self.config["jupyter_download_nb_execute"]:
        # Reuse the client started for the website target, if any.
        if self.client is None:
            self.client = start_client()
        self.download_execution_vars = fresh_execution_vars(
            'downloads', self.downloadsExecutedir)
def client(cluster):
    """Yield a dask ``Client`` connected to ``cluster``.

    The client is used as a context manager, so it is shut down once the
    generator is resumed after the ``yield`` or closed.
    Presumably registered as a test fixture (the decorator is not visible
    here).
    """
    with Client(cluster) as dask_client:
        yield dask_client
def test_empty_dmatrix(self):
    """Run the empty-DMatrix check with ``gpu_hist`` on a local CUDA cluster."""
    with LocalCUDACluster() as cuda_cluster, Client(cuda_cluster) as dask_client:
        run_empty_dmatrix(dask_client, {'tree_method': 'gpu_hist'})
} exe = processor.futures_executor elif iterative: exe_args = { 'function_args': {'flatten': False}, "schema": NanoAODSchema, } exe = processor.iterative_executor else: from Tools.helpers import get_scheduler_address from dask.distributed import Client, progress scheduler_address = get_scheduler_address() c = Client(scheduler_address) exe_args = { 'client': c, 'function_args': {'flatten': False}, "schema": NanoAODSchema, "tailtimeout": 300, "retries": 3, "skipbadfiles": True } exe = processor.dask_executor # add some histograms that we defined in the processor # everything else is taken the default_accumulators.py from processor.default_accumulators import multiplicity_axis, dataset_axis, score_axis, pt_axis, ht_axis desired_output.update({