def _get_results_by_threading(self, func, params): """ Query github API by multithreading. return a list containing all results. """ num_workers = self.num_workers if func.__name__ not in [ "multi_pulls", "multi_commits", "multi_watchers" ]: num_workers = 1 if self.debug_counts: p = ThPool(num_workers) pool_args = params[:self.debug_counts] return p.map(func, pool_args) else: stats = [] start = time.time() for i in range(int(params.totalCount / self.batch_size) + 1): if self.num_workers != 1 and i != 0 and ( i + 1) * self.batch_size % 800 == 0: print("Sleep 30 sec") sleep(30) p = ThPool(num_workers) temp = p.map( func, params[i * self.batch_size:(i + 1) * self.batch_size]) stats += temp print( f"{self.repo_name}, {func.__name__} takes: {round(time.time() - start, 3)} secs" ) return stats
def begin_processing(self):
    pool = ThreadPool(nodes=Helper.config('threads'))

    for course in self.course_data:
        pool.map(self.download_lesson, course['lessons'])
        print(
            '--- Course "{course_title}" has been downloaded, with total of "{lessons_amount}" lessons.'
            .format(course_title=course['title'],
                    lessons_amount=len(course['lessons'])))
        time.sleep(Helper.config('sleep'))
def _split_variable(self):
    """Split by variable."""
    outputfiles = [
        self._define_outputfilename(var, self.years)
        for var in self.variables
    ]
    years = len(outputfiles) * [self.years]
    if not self.threads:
        pool = Pool()
    else:
        pool = Pool(nodes=self.threads)
    pool.map(self._getdata, self.variables, years, outputfiles)
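The snippet above passes several argument lists to a single `map` call; pathos pools accept one iterable per positional argument of the mapped function, just like the built-in `map`. A minimal standalone sketch of that calling convention follows (the worker, variable names, and file names are purely illustrative, not from the original project):

from pathos.threading import ThreadPool

def fetch(variable, years, outputfile):
    # placeholder worker: the real code downloads `variable` for `years`
    # and writes the result to `outputfile`
    print(variable, years, outputfile)

pool = ThreadPool(nodes=2)
# one iterable per argument; items are paired up positionally
pool.map(fetch,
         ["t2m", "precip"],
         [[2000, 2001], [2000, 2001]],
         ["t2m.nc", "precip.nc"])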
def build(
    charm_list,
    layer_list,
    layer_index,
    charm_branch,
    layer_branch,
    resource_spec,
    filter_by_tag,
    to_channel,
    rebuild_cache,
):
    build_env = BuildEnv(build_type=BuildType.CHARM)
    build_env.db["build_args"] = {
        "artifact_list": charm_list,
        "layer_list": layer_list,
        "layer_index": layer_index,
        "charm_branch": charm_branch,
        "layer_branch": layer_branch,
        "resource_spec": resource_spec,
        "filter_by_tag": list(filter_by_tag),
        "to_channel": to_channel,
        "rebuild_cache": rebuild_cache,
    }
    build_env.pull_layers()

    entities = []
    for charm_map in build_env.artifacts:
        for charm_name, charm_opts in charm_map.items():
            if not any(match in filter_by_tag for match in charm_opts["tags"]):
                continue

            charm_entity = f"cs:~{charm_opts['namespace']}/{charm_name}"
            entities.append(
                BuildEntity(build_env, charm_name, charm_opts, charm_entity))
            click.echo(f"Queued {charm_entity} for building")

    def _run_build(build_entity):
        build_entity.setup()

        if not build_entity.has_changed:
            return

        build_entity.proof_build()
        build_entity.push()
        build_entity.attach_resource("unpublished")
        build_entity.promote(to_channel=to_channel)

    pool = ThreadPool()
    pool.map(_run_build, entities)
    build_env.save()
def _split_variable_yr(self):
    """Fetch variable split by variable and year."""
    outputfiles = []
    variables = []
    years = []
    for var in self.variables:
        # build one (variable, year, outputfile) triple per request so the
        # three lists passed to map() stay the same length
        for yr in self.years:
            outputfiles.append(self._define_outputfilename(var, [yr]))
            variables.append(var)
            years.append(yr)
    if not self.threads:
        pool = Pool()
    else:
        pool = Pool(nodes=self.threads)
    pool.map(self._getdata, variables, years, outputfiles)
def _get_results_by_threading(self, func, params): """ Query github API by multithreading. return a list containing all results. """ num_workers = self.num_workers if func.__name__ not in [ "multi_pulls", "multi_commits", "multi_watchers" ]: num_workers = 1 stats = [] start = time.time() for i in range(len(params) // NUM_PER_PAGE): # pdb.set_trace() if self.num_workers != 1 and (i == 0 or (i + 1) * NUM_PER_PAGE % 400 == 0): sec = random.choice(range(10, 60)) print("Sleep {} sec".format(sec)) sleep(sec) p = ThPool(num_workers) temp = p.map(func, params[i * NUM_PER_PAGE:(i + 1) * NUM_PER_PAGE]) stats += temp print( f"{self.repo_name}, {func.__name__} takes: {round(time.time()-start,3)} secs" ) return stats
def pull_layers(self): """ clone all downstream layers to be processed locally when doing charm builds """ if self.rebuild_cache: click.echo("- rebuild cache triggered, cleaning out cache.") shutil.rmtree(str(self.layers_dir)) shutil.rmtree(str(self.interfaces_dir)) os.mkdir(str(self.layers_dir)) os.mkdir(str(self.interfaces_dir)) layers_to_pull = [] for layer_map in self.layers: layer_name = list(layer_map.keys())[0] if layer_name == "layer:index": continue layers_to_pull.append(layer_name) pool = ThreadPool() pool.map(self.download, layers_to_pull) self.db["pull_layer_manifest"] = [] _paths_to_process = { "layer": glob("{}/*".format(str(self.layers_dir))), "interface": glob("{}/*".format(str(self.interfaces_dir))), } for prefix, paths in _paths_to_process.items(): for _path in paths: build_path = _path if not build_path: raise BuildException( f"Could not determine build path for {_path}") git.checkout(self.layer_branch, _cwd=build_path) layer_manifest = { "rev": git("rev-parse", "HEAD", _cwd=build_path).stdout.decode().strip(), "url": f"{prefix}:{Path(build_path).stem}", } self.db["pull_layer_manifest"].append(layer_manifest) click.echo( f"- {layer_manifest['url']} at commit: {layer_manifest['rev']}" )
def data_func(measurement):
    if not use_threads:
        data = numpy.full(sources.shape + geobox.shape,
                          measurement['nodata'],
                          dtype=measurement['dtype'])
        for index, datasets in numpy.ndenumerate(sources.values):
            _fuse_measurement(data[index], datasets, geobox, measurement,
                              fuse_func=fuse_func,
                              skip_broken_datasets=skip_broken_datasets,
                              driver_manager=driver_manager)
    else:
        def work_load_data(array_name, index, datasets):
            data = sa.attach(array_name)
            _fuse_measurement(data[index], datasets, geobox, measurement,
                              fuse_func=fuse_func,
                              skip_broken_datasets=skip_broken_datasets,
                              driver_manager=driver_manager)

        array_name = '_'.join(['DCCORE', str(uuid.uuid4()), str(os.getpid())])
        sa.create(array_name, shape=sources.shape + geobox.shape,
                  dtype=measurement['dtype'])
        data = sa.attach(array_name)
        data[:] = measurement['nodata']

        pool = ThreadPool(32)
        pool.map(work_load_data, repeat(array_name),
                 *zip(*numpy.ndenumerate(sources.values)))
        sa.delete(array_name)
    return data
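The threaded branch above uses SharedArray so that every worker writes into the same buffer: the array is created once under a unique name, each worker re-attaches by name and fills only its own slice, and the segment is deleted after the fill. Below is a small self-contained sketch of that pattern, assuming the SharedArray and pathos packages are installed; the array name and shape are made up for illustration and are not from the original code.

import SharedArray as sa
from pathos.threading import ThreadPool

name = 'demo_fill'  # illustrative segment name, not from the original code
sa.create(name, shape=(4, 100), dtype='float32')

def fill_row(row):
    data = sa.attach(name)  # each worker attaches to the same shared buffer
    data[row, :] = row      # and writes only its own slice

pool = ThreadPool(4)
pool.map(fill_row, range(4))

result = sa.attach(name).copy()  # copy out before unlinking the segment
sa.delete(name)
print(result[:, 0])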
def pull_layers(self): """clone all downstream layers to be processed locally when doing charm builds""" layers_to_pull = [] for layer_map in self.layers: layer_name = list(layer_map.keys())[0] if layer_name == "layer:index": continue layers_to_pull.append(layer_name) pool = ThreadPool() results = pool.map(self.download, layers_to_pull) self.db["pull_layer_manifest"] = [result for result in results]
def filter_results(self, im_array, results, image_times, model,
                   psf_sigma=1.0, batch_size=32, chunk_size=10000):
    """
    Use a keras neural network model to detect real objects based upon
    the coadded postage stamps of those objects. Filter and keep only
    actual objects going forward.

    Parameters
    ----------
    im_array: numpy array, required
        The masked original images. See loadMaskedImages in searchImage.py.
    results_arr: numpy recarray, required
        The results output from findObjects in searchImage.
    image_times: numpy array, required
        An array containing the image times in DAYS with the first image at
        time 0. Note: This is different than other methods so the units of
        this may change. Watch this documentation.
    model: keras model, required
        A previously trained model loaded from an hdf5 file.
    batch_size: int
        Batch size for keras predict.

    Returns
    -------
    filtered_results: numpy array
        An edited version of results_arr with only the rows where true
        objects were classified.
    """
    keep_objects = np.array([])
    total_chunks = np.ceil(len(results) / float(chunk_size))
    chunk_num = 1
    circle_vals = []

    enumerated_results = list(enumerate(results))
    self.im_array = im_array
    self.image_times = image_times
    self.psf_sigma = psf_sigma

    # The original chunked, image-moments-based classification loop was
    # commented out in favor of the pooled circularity test below.
    pool = Pool(nodes=8)
    test_classes = pool.map(self.circularity_test, enumerated_results)
    test_classes = np.array(test_classes).T

    keep_idx = test_classes[0][np.where(np.array(test_classes[1]) > .5)]
    print(keep_idx)
    print(test_classes[0][np.where(np.array(test_classes[1]) > .5)])
    keep_objects = keep_idx

    print("Finished chunk %i of %i" % (chunk_num, total_chunks))
    chunk_num += 1

    filtered_results = results[np.array(keep_objects, dtype=int)]

    return filtered_results
def create_storage(coords, geobox, measurements, data_func=None,
                   use_threads=False):
    """
    Create a :class:`xarray.Dataset` and (optionally) fill it with data.

    This function makes the in memory storage structure to hold datacube data,
    loading data from datasets that have been grouped appropriately by
    :meth:`group_datasets`.

    :param dict coords:
        OrderedDict holding `DataArray` objects defining the dimensions not
        specified by `geobox`

    :param GeoBox geobox:
        A GeoBox defining the output spatial projection and resolution

    :param measurements:
        list of :class:`datacube.model.Measurement`

    :param data_func:
        function to fill the storage with data. It is called once for each
        measurement, with the measurement as an argument. It should return an
        appropriately shaped numpy array. If not provided, an empty
        :class:`xarray.Dataset` is returned.

    :param bool use_threads:
        Optional. If this is set to True, IO will be multi-threaded. May not
        work for all drivers due to locking/GIL. Default is False.

    :rtype: :class:`xarray.Dataset`

    .. seealso:: :meth:`find_datasets` :meth:`group_datasets`
    """
    def empty_func(measurement_):
        coord_shape = tuple(coord_.size for coord_ in coords.values())
        return numpy.full(coord_shape + geobox.shape, measurement_.nodata,
                          dtype=measurement_.dtype)

    data_func = data_func or empty_func

    result = xarray.Dataset(attrs={'crs': geobox.crs})
    for name, coord in coords.items():
        result[name] = coord
    for name, coord in geobox.coordinates.items():
        result[name] = (name, coord.values, {'units': coord.units})

    def work_measurements(measurement, data_func):
        return data_func(measurement)

    use_threads = use_threads and THREADING_REQS_AVAILABLE

    if use_threads:
        pool = ThreadPool(32)
        results = pool.map(work_measurements, measurements, repeat(data_func))
    else:
        results = [data_func(a) for a in measurements]

    for measurement in measurements:
        data = results.pop(0)
        attrs = measurement.dataarray_attrs()
        attrs['crs'] = geobox.crs
        dims = tuple(coords.keys()) + tuple(geobox.dimensions)
        result[measurement.name] = (dims, data, attrs)

    return result
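In the threaded branch above, every measurement is paired with the same `data_func` by mapping over the measurement list together with `itertools.repeat`. The following self-contained sketch shows that dispatch pattern on its own; the measurement dicts and loader are stand-ins, not part of the datacube API:

from itertools import repeat
from pathos.threading import ThreadPool

measurements = [{'name': 'red'}, {'name': 'green'}, {'name': 'blue'}]

def load_measurement(measurement):
    # stand-in for the real per-measurement loader
    return measurement['name'].upper()

def work_measurements(measurement, data_func):
    return data_func(measurement)

pool = ThreadPool(4)
# repeat() supplies the same callable for every measurement in the map
results = pool.map(work_measurements, measurements, repeat(load_measurement))
print(results)  # ['RED', 'GREEN', 'BLUE']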
x = hub.array(shape, name='imagenet/test:latest', dtype='uint8')
print(x.shape)
index = 1

def upload_val(index):
    t1 = time.time()

    # Preprocess the image
    img = Image.open(val_path[index])
    img = img.resize((500, 375), Image.ANTIALIAS)
    img = np.asarray(img)
    if len(img.shape) == 2:
        img = np.expand_dims(img, -1)
    if img.shape[-1] == 4:
        img = img[..., :3]
    img = np.transpose(img, axes=(1, 0, 2))

    # Upload the image
    t2 = time.time()
    x[index] = np.expand_dims(img, 0)
    t3 = time.time()
    print("uploading {}/{}: downloaded in {}s and uploaded in {}s ".format(
        index, len(val_path), t2 - t1, t3 - t2))

t1 = time.time()
# `pool` is assumed to be a thread pool (e.g. a pathos ThreadPool) created
# earlier in the script.
list(pool.map(upload_val, list(range(len(val_path)))))
t2 = time.time()
print('uploaded {} images in {}s'.format(len(val_path), t2 - t1))
from pathos.threading import ThreadPool
import time

pool = ThreadPool(nodes=4)

# do a blocking map on the chosen function
print(pool.map(pow, [1, 2, 3, 4], [5, 6, 7, 8]))

# do a non-blocking map, then extract the results from the iterator
results = pool.imap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
print("...")
print(list(results))

# do an asynchronous map, then get the results
results = pool.amap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
while not results.ready():
    time.sleep(5)
    print(".")
print(results.get())

# do one item at a time, using a pipe
print(pool.pipe(pow, 1, 5))
print(pool.pipe(pow, 2, 6))

# do one item at a time, using an asynchronous pipe
result1 = pool.apipe(pow, 1, 5)
result2 = pool.apipe(pow, 2, 6)
print(result1.get())
print(result2.get())
def tuneHyperParameters(simsettingsFileName, hyperSettings=None,
                        saved_fd_model_path=None):
    """
    For some set of parameters the function will sample a number of them
    in order to find a more optimal configuration.
    """
    import os

    result_data = {}

    file = open(simsettingsFileName)
    settings = json.load(file)
    print("Settings: " + str(json.dumps(settings, indent=4)))
    file.close()

    file = open(hyperSettings)
    hyper_settings = json.load(file)
    print("Hyper settings: " + str(json.dumps(hyper_settings, indent=4)))
    file.close()

    num_sim_samples = hyper_settings['meta_sim_samples']

    ## Check to see if there exists a saved fd model; if so, save the path in
    ## the hyper settings
    if saved_fd_model_path is not None:
        directory = getDataDirectory(settings)
        if not os.path.exists(directory):
            hyper_settings['saved_fd_model_path'] = saved_fd_model_path

    param_settings = get_param_values(hyper_settings)
    result_data['hyper_param_settings_files'] = []
    sim_data = []
    data_name = settings['data_folder']
    for params in param_settings:  ## Loop over each setting of parameters
        data_name_tmp = ""
        ## Assemble the vector of parameters and the data folder name
        for par in range(len(params)):
            param_of_interest = hyper_settings['param_to_tune'][par]
            data_name_tmp = (data_name_tmp + "/_" + param_of_interest + "_" +
                             str(params[par]) + "/")
            settings[param_of_interest] = params[par]

        settings['data_folder'] = data_name + data_name_tmp
        directory = getBaseDataDirectory(settings)
        if not os.path.exists(directory):
            os.makedirs(directory)

        out_file_name = directory + os.path.basename(simsettingsFileName)
        result_data['hyper_param_settings_files'].append(out_file_name)
        print("Saving settings file with data to: ", out_file_name)
        print("settings['data_folder']: ", settings['data_folder'])
        out_file = open(out_file_name, 'w')
        out_file.write(json.dumps(settings, indent=4))
        out_file.close()

        sim_data.append((simsettingsFileName, num_sim_samples,
                         copy.deepcopy(settings),
                         hyper_settings['meta_sim_threads'],
                         copy.deepcopy(hyper_settings)))

    # p = ProcessingPool(2)
    p = ThreadPool(hyper_settings['tuning_threads'])
    t0 = time.time()
    result = p.map(_trainMetaModel, sim_data)
    t1 = time.time()
    print("Hyper parameter tuning complete in " +
          str(datetime.timedelta(seconds=(t1 - t0))) + " seconds")
    result_data['sim_time'] = ("Meta model training complete in " +
                               str(datetime.timedelta(seconds=(t1 - t0))) +
                               " seconds")
    result_data['meta_sim_result'] = result
    result_data['raw_sim_time_in_seconds'] = t1 - t0
    result_data['Number_of_simulations_sampled'] = len(param_settings)
    result_data['Number_of_threads_used'] = hyper_settings['tuning_threads']
    print(result)
    return result_data
def create_storage(coords, geobox, measurements, data_func=None,
                   use_threads=False):
    """
    Create a :class:`xarray.Dataset` and (optionally) fill it with data.

    This function makes the in memory storage structure to hold datacube data,
    loading data from datasets that have been grouped appropriately by
    :meth:`group_datasets`.

    :param dict coords:
        OrderedDict holding `DataArray` objects defining the dimensions not
        specified by `geobox`

    :param GeoBox geobox:
        A GeoBox defining the output spatial projection and resolution

    :param measurements:
        list of measurement dicts with keys: {'name', 'dtype', 'nodata', 'units'}

    :param data_func:
        function to fill the storage with data. It is called once for each
        measurement, with the measurement as an argument. It should return an
        appropriately shaped numpy array.

    :param bool use_threads:
        Optional. If this is set to True, IO will be multi-threaded. May not
        work for all drivers due to locking/GIL. Default is False.

    :rtype: :class:`xarray.Dataset`

    .. seealso:: :meth:`find_datasets` :meth:`group_datasets`
    """
    def empty_func(measurement_):
        coord_shape = tuple(coord_.size for coord_ in coords.values())
        return numpy.full(coord_shape + geobox.shape, measurement_['nodata'],
                          dtype=measurement_['dtype'])

    data_func = data_func or empty_func

    result = xarray.Dataset(attrs={'crs': geobox.crs})
    for name, coord in coords.items():
        result[name] = coord
    for name, coord in geobox.coordinates.items():
        result[name] = (name, coord.values, {'units': coord.units})

    def work_measurements(measurement, data_func):
        return data_func(measurement)

    if use_threads and ('SharedArray' not in sys.modules or
                        'pathos.threading' not in sys.modules):
        use_threads = False

    if use_threads:
        pool = ThreadPool(32)
        results = pool.map(work_measurements, measurements, repeat(data_func))
    else:
        results = [data_func(a) for a in measurements]

    for measurement in measurements:
        data = results.pop(0)
        attrs = {
            'nodata': measurement.get('nodata'),
            'units': measurement.get('units', '1'),
            'crs': geobox.crs
        }
        if 'flags_definition' in measurement:
            attrs['flags_definition'] = measurement['flags_definition']
        if 'spectral_definition' in measurement:
            attrs['spectral_definition'] = measurement['spectral_definition']
        dims = tuple(coords.keys()) + tuple(geobox.dimensions)
        result[measurement['name']] = (dims, data, attrs)

    return result
def process(self):
    """Process rules."""
    pool = ThreadPool()
    pool.map(self.__process, self.files_to_process)
def main():
    main_dir = Path(r'E:\dwd_meteo')
    os.chdir(main_dir)

    out_dir = Path(r'zipped_DWD_data')
    test_exist_dir = Path('extracted')

    main_site = r'https://opendata.dwd.de'

    out_dir_names = [
        'hist_daily_met',
        'pres_daily_met',
        'hist_daily_more_precip',
        'pres_daily_more_precip',
        'hist_daily_soil_temp',
        'pres_daily_soil_temp',
        'daily_solar',
        'hist_hourly_precip',
        'pres_hourly_precip',
        'hist_hourly_temp',
        'pres_hourly_temp',
        'hist_hourly_cloud_type',
        'pres_hourly_cloud_type',
        'hist_hourly_cloudiness',
        'pres_hourly_cloudiness',
        'hist_hourly_pressure',
        'pres_hourly_pressure',
        'hist_hourly_soil_temp',
        'pres_hourly_soil_temp',
        'hourly_solar',
        'hist_hourly_sun',
        'pres_hourly_sun',
        'hist_hourly_visib',
        'pres_hourly_visib',
    ]

    sub_links = [
        r'/climate_environment/CDC/observations_germany/climate/daily/kl/historical/',
        r'/climate_environment/CDC/observations_germany/climate/daily/kl/recent/',
        r'/climate_environment/CDC/observations_germany/climate/daily/more_precip/historical/',
        r'/climate_environment/CDC/observations_germany/climate/daily/more_precip/recent/',
        r'/climate_environment/CDC/observations_germany/climate/daily/soil_temperature/historical/',
        r'/climate_environment/CDC/observations_germany/climate/daily/soil_temperature/recent/',
        r'/climate_environment/CDC/observations_germany/climate/daily/solar/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/precipitation/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/precipitation/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/cloud_type/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/cloud_type/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/cloudiness/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/cloudiness/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/pressure/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/pressure/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/soil_temperature/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/soil_temperature/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/solar/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/sun/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/sun/recent/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/visibility/historical/',
        r'/climate_environment/CDC/observations_germany/climate/hourly/visibility/recent/',
    ]

    assert len(out_dir_names) == len(sub_links)

    out_dir.mkdir(exist_ok=True)

    n_threads = len(out_dir_names)

    if n_threads == 1:
        for i in range(len(out_dir_names)):
            download_data(main_site + sub_links[i],
                          out_dir / out_dir_names[i],
                          test_exist_dir)
    else:
        thread_pool = ThreadPool(nodes=n_threads)
        thread_pool.map(
            download_data,
            [main_site + sub_link for sub_link in sub_links],
            [out_dir / out_dir_name for out_dir_name in out_dir_names],
            [test_exist_dir] * n_threads)
    return