        acs_event_gen,
        n_surgery,
        background_surgery_gen,
        background_surgery_duration_gen,
        bg_scale,
        target_scale,
        simulation_time,
        use_queueing=queue,
    )
    sim_stats = dept_des.get_queue_statistics(sim_res)
    sim_stats["BG_SCALE"] = bg_scale
    sim_stats["TARGET_SCALE"] = target_scale
    sim_stats["N_SURG"] = n_surgery
    print(run_id, sim_stats)
    return sim_stats


if __name__ == "__main__":
    total_log = []
    for tg_s in [0.5, 1.0, 1.5, 2.0]:
        for bg_s in [0.5, 1.0, 1.5, 2.0]:
            for nps in [1, 2, 3, 4, 5]:
                run_res = joblib.Parallel(n_jobs=6)(
                    joblib.delayed(single_experiment_run)(tg_s, bg_s, nps, True, i_run)
                    for i_run in range(100))
                total_log.extend(run_res)
    total_log_df = pd.DataFrame(total_log, columns=total_log[0].keys())
    total_log_df.to_csv("logs" + ps + "queue-stats-" +
                        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") +
                        ".csv")
def get_X(paths):
    delayed_get_descriptors = \
        joblib.delayed(di.descriptors.cached_get_descriptors)
    X = joblib.Parallel(n_jobs=-1, verbose=10)(
        delayed_get_descriptors(path) for path in paths)
    return np.vstack(X)
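# A minimal, self-contained sketch of the same pattern as get_X above, with a
# stand-in for the project-specific `di.descriptors.cached_get_descriptors`
# (assumed here to return a 2-D feature array of fixed width per file):
import joblib
import numpy as np


def _fake_descriptors(path):
    # pretend every file yields a (10, 4) descriptor matrix
    return np.zeros((10, 4))


def get_X_demo(paths):
    delayed_desc = joblib.delayed(_fake_descriptors)
    X = joblib.Parallel(n_jobs=-1, verbose=0)(delayed_desc(p) for p in paths)
    return np.vstack(X)

# get_X_demo(["a.wav", "b.wav"]).shape  -> (20, 4)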
def create_archipelago(
    unknowns: list,
    optimizers: list,
    optimizers_kwargs: list,
    pg_problem: pygmo.problem,
    rel_pop_size: float,
    archipelago_kwargs: dict,
    log_each_nth_gen: int,
    report_level: int,
) -> PyfoombArchipelago:
    """
    Helper method for parallelized estimation using the generalized island model.
    Creates the archipelago object for running several rounds of evolutions.

    Arguments
    ---------
        unknowns : list
            The unknowns, sorted alphabetically and case-insensitively.
        optimizers : list
            A list of optimizers to be used on individual islands.
        optimizers_kwargs : list
            A list of corresponding kwargs.
        pg_problem : pygmo.problem
            A pygmo problem instance.
        rel_pop_size : float
            Population size of each island, relative to the number of unknowns.
        archipelago_kwargs : dict
            Additional kwargs for archipelago creation.
        log_each_nth_gen : int
            Specifies that the algorithm stores logs at every n-th generation.
        report_level : int
            Prints information on the archipelago creation for values >= 1.

    Returns
    -------
        archipelago : PyfoombArchipelago
    """

    _cpus = joblib.cpu_count()

    # There is one optimizer with a set of kwargs
    if len(optimizers) == 1 and len(optimizers_kwargs) == 1:
        optimizers = optimizers * _cpus
        optimizers_kwargs = optimizers_kwargs * _cpus
    # Several optimizers with the same kwargs
    elif len(optimizers) > 1 and len(optimizers_kwargs) == 1:
        optimizers_kwargs = optimizers_kwargs * len(optimizers)
    # Several kwargs for the same optimizer
    elif len(optimizers) == 1 and len(optimizers_kwargs) > 1:
        optimizers = optimizers * len(optimizers_kwargs)
    elif len(optimizers) != len(optimizers_kwargs):
        raise ValueError(
            'Number of optimizers does not match number of corresponding kwarg dicts'
        )

    # Get the optimizer instances
    algos = [
        PygmoOptimizers.get_optimizer_algo_instance(
            name=_optimizers, kwargs=_optimizers_kwargs)
        for _optimizers, _optimizers_kwargs in zip(optimizers, optimizers_kwargs)
    ]

    # Update number of islands
    n_islands = len(algos)

    if report_level >= 1:
        print(f'Creating archipelago with {n_islands} islands. May take some time...')

    pop_size = int(numpy.ceil(rel_pop_size * len(unknowns)))
    prop_create_args = (
        (pg_problem, pop_size, seed * numpy.random.randint(0, 1e4))
        for seed, pop_size in enumerate([pop_size] * n_islands)
    )

    try:
        parallel_verbose = 0 if report_level == 0 else 1
        with joblib.parallel_backend('loky', n_jobs=n_islands):
            pops = joblib.Parallel(verbose=parallel_verbose)(
                map(joblib.delayed(ArchipelagoHelpers.parallel_create_population),
                    prop_create_args))
    except Exception as ex:
        print(f'Parallelized archipelago creation failed, falling back to sequential\n{ex}')
        pops = (
            ArchipelagoHelpers.parallel_create_population(prop_create_arg)
            for prop_create_arg in prop_create_args
        )

    # Now create the empty archipelago
    if 't' not in archipelago_kwargs.keys():
        archipelago_kwargs['t'] = pygmo.fully_connected()
    archi = PyfoombArchipelago(**archipelago_kwargs)
    archi.set_migrant_handling(pygmo.migrant_handling.preserve)

    # Add the populations to the archipelago and wait for its construction
    with contextlib.redirect_stdout(io.StringIO()):
        for _pop, _algo in zip(pops, algos):
            if log_each_nth_gen is not None:
                _algo.set_verbosity(int(log_each_nth_gen))
            _island = pygmo.island(algo=_algo, pop=_pop, udi=pygmo.mp_island())
            archi.push_back(_island)
    archi.wait_check()

    return archi
    jl.dump((sampler_name, sampler), sampler_file, compress=4)
    return sampler_file


def filter_file_list(data_files):
    new_list = []
    for data_file in data_files:
        sampler_name = splitext(basename(data_file))[0]
        sampler_file = 'ramp_fits/sampler/{:s}.gz'.format(sampler_name)
        if exists(sampler_file):
            print('Skipping %s' % data_file)
        else:
            new_list.append(data_file)
    return new_list


if __name__ == "__main__":
    if len(sys.argv) == 1:
        from glob import glob
        data_files = glob('data/ramp_data/*.csv')
    else:
        data_files = sys.argv[1:]

    data_files = filter_file_list(data_files)
    print('Number of data files: %g' % len(data_files))

    with jl.Parallel(n_jobs=4, verbose=20) as par:
        par(jl.delayed(process_input_file)(data_file)
            for data_file in data_files)
def _train(self, dataset):
    pmeasure = ProxyMeasure(
        self.lrn,
        postproc=BinaryFxNode(self.errorfx, self.lrn.space),
        skip_train=not self.train_pmeasure  # do not train since fmeasure will
    )

    # First we need to replicate our RFE construct but this time
    # with pmeasure for the classifier
    rfe = RFE(
        self.fmeasure,
        pmeasure,
        Splitter('partitions'),
        fselector=self.fselector,
        bestdetector=None,
        train_pmeasure=self.train_pmeasure,
        stopping_criterion=None,  # full "track"
        update_sensitivity=self.update_sensitivity,
        enable_ca=['errors', 'nfeatures'])

    errors, nfeatures = [], []

    if __debug__:
        debug("RFEC", "Stage 1: initial nested CV/RFE for %s", (dataset, ))

    if self.nproc != 1 and externals.exists('joblib'):
        nested_results = jl.Parallel(self.nproc)(
            jl.delayed(_process_partition)(rfe, partition)
            for partition in self.partitioner.generate(dataset))
    else:
        nested_results = [
            _process_partition(rfe, partition)
            for partition in self.partitioner.generate(dataset)
        ]

    # unzip
    errors = [x[0] for x in nested_results]
    nfeatures = [x[1] for x in nested_results]

    self.ca.nested_nfeatures = nfeatures
    self.ca.nested_errors = errors

    # mean errors across splits and find optimal number
    errors_mean = np.mean(errors, axis=0)
    nfeatures_mean = np.mean(nfeatures, axis=0)
    # we will take the "mean location" of the min to stay
    # within the most 'stable' choice
    mins_idx = np.where(errors_mean == np.min(errors_mean))[0]
    min_idx = mins_idx[int(len(mins_idx) / 2)]
    min_error = errors_mean[min_idx]
    assert (min_error == np.min(errors_mean))
    nfeatures_min = nfeatures_mean[min_idx]

    if __debug__:
        debug("RFEC",
              "Choosing among %d choices to have %d features with "
              "mean error=%.2g (initial mean error %.2g)",
              (len(mins_idx), nfeatures_min, min_error, errors_mean[0]))

    self.nfeatures_min = nfeatures_min

    if __debug__:
        debug("RFEC",
              "Stage 2: running RFE on full training dataset to "
              "obtain the best %d features" % nfeatures_min)

    super(SplitRFE, self)._train(dataset)
            continue
        try:
            B, F, T = mlab.specgram(
                d[i, st:et],
                NFFT=128,
                Fs=500000,  # 500 kHz
                window=mlab.window_hanning,
                noverlap=126)
            # get B[2:34, :] --> [32, 8270]
            B = B[2:34, :]
            B_all.append(B)
        except:
            pass

    B_all = np.dstack(B_all)  # 3D array
    B_all /= 40000  # ad-hoc normalization
    print('current_loc:', current_loc, [B_all.max(), B_all.min()],
          [bat1x, bat1y, bat1z])
    np.save(path + "/trueXYZ_" + '{:09d}'.format(current_loc),
            np.array([bat1x, bat1y, bat1z]))
    np.save(path + "/specgram_" + '{:09d}'.format(current_loc), B_all)


results = joblib.Parallel(n_jobs=-1)([
    joblib.delayed(make_specgram)(current_loc)
    for current_loc in np.arange(1606200, 46546200, 1500)
])
def fit(self, train_data, test_data=None): """Fits `self.learners` using folds sampled from the provided data. Args: train_data (Table): table to sample train folds test_data (Optional[Table]): tap to sample test folds of None then `train_data` will be used """ test_data = test_data or train_data self.setup_indices(train_data, test_data) self.prepare_arrays(test_data) self._prepare_arrays(test_data) n_callbacks = len(self.learners) * len(self.indices) n_jobs = max(1, min(self.n_jobs, n_callbacks)) def _is_picklable(obj): try: return bool(pickle.dumps(obj)) except (AttributeError, TypeError, pickle.PicklingError): return False if n_jobs > 1 and not all( _is_picklable(learner) for learner in self.learners): n_jobs = 1 warnings.warn( "Not all arguments (learners) are picklable. " "Setting n_jobs=1", OrangeWarning) if n_jobs > 1 and mp.current_process().daemon: n_jobs = 1 warnings.warn( "Worker subprocesses cannot spawn new worker " "subprocesses (e.g. parameter tuning with internal " "cross-validation). Setting n_jobs=1", OrangeWarning) # Workaround for NumPy locking on Macintosh and Ubuntu 14.04 LTS # May be removed once offending libs and OSes are nowhere to be found. # https://pythonhosted.org/joblib/parallel.html#bad-interaction-of-multiprocessing-and-third-party-libraries mp_ctx = mp.get_context('forkserver' if sys.platform.startswith(( 'darwin', 'linux')) and n_jobs > 1 else None) if (n_jobs > 1 and mp_ctx.get_start_method() != 'fork' and train_data.X.size < self._MIN_NJOBS_X_SIZE): n_jobs = 1 warnings.warn( "Working with small-enough data; single-threaded " "sequential excecution will (probably) be faster. " "Setting n_jobs=1", OrangeWarning) try: # Use context-adapted Queue or just the regular Queue if no # multiprocessing (otherwise it shits itself at least on Windos) mp_queue = mp_ctx.Manager().Queue() if n_jobs > 1 else mp.Queue() except (EOFError, RuntimeError): mp_queue = mp.Queue() n_jobs = 1 warnings.warn( ''' Can't run multiprocessing code without a __main__ guard. Multiprocessing strategies 'forkserver' (used by Orange's evaluation methods by default on Mac OS X) and 'spawn' (default on Windos) require the main code entry point be guarded with: if __name__ == '__main__': import multiprocessing as mp mp.freeze_support() # Needed only on Windos ... # Rest of your code ... # See: https://docs.python.org/3/library/__main__.html Otherwise, as the module is re-imported in another process, infinite recursion ensues. Guard your executed code with above Python idiom, or pass n_jobs=1 to evaluation methods, i.e. {}(..., n_jobs=1). Setting n_jobs to 1. '''.format(self.__class__.__name__), OrangeWarning) data_splits = ((fold_i, self.preprocessor(train_data[train_i]), test_data[test_i]) for fold_i, (train_i, test_i) in enumerate(self.indices)) args_iter = ( (fold_i, train_data, test_data, learner_i, learner, self.store_models, mp_queue) # NOTE: If this nested for loop doesn't work, try # itertools.product for (fold_i, train_data, test_data) in data_splits for (learner_i, learner) in enumerate(self.learners)) def _callback_percent(n_steps, queue): """Block until one of the subprocesses completes, before signalling callback with percent""" for percent in np.linspace(.0, .99, n_steps + 1)[1:]: queue.get() try: self._callback(percent) except Exception: # Callback may error for whatever reason (e.g. 
PEBKAC) # In that case, rather gracefully continue computation # instead of failing pass results = [] with joblib.Parallel(n_jobs=n_jobs, backend=mp_ctx) as parallel: tasks = (joblib.delayed(_mp_worker)(*args) for args in args_iter) # Start the tasks from another thread ... thread = Thread(target=lambda: results.append(parallel(tasks))) thread.start() # ... so that we can update the GUI (callback) from the main thread _callback_percent(n_callbacks, mp_queue) thread.join() results = sorted(results[0]) ptr, prev_fold_i, prev_n_values = 0, 0, 0 for res in results: if res.fold_i != prev_fold_i: ptr += prev_n_values prev_fold_i = res.fold_i result_slice = slice(ptr, ptr + res.n_values) prev_n_values = res.n_values if res.failed: self.failed[res.learner_i] = res.failed continue if self.store_models: self.models[res.fold_i][res.learner_i] = res.model self.predicted[res.learner_i][result_slice] = res.values if train_data.domain.has_discrete_class: self.probabilities[res.learner_i][result_slice, :] = res.probs self._callback(1) return self
def tune(self, ncores=1, csvname=None, verbose=True):
    """
    This function starts the tuning process with a specified number of processors

    :param ncores: (int) number of parallel processors (see the **Notes** section below for an important note about parallel execution)
    :param csvname: (str) the name of the csv file name to save the tuning results
        (useful for expensive cases as the csv file is updated directly after the case is done)
    :param verbose: (bool) whether to print updates to the screen or not
    """
    self.ncores = ncores
    self.csvlogger = csvname
    self.verbose = verbose

    if self.verbose:
        print('***************************************************************')
        print('****************Bayesian Search is Running*********************')
        print('***************************************************************')

        if self.ncores > 1:
            print('--- Running in parallel with {} threads and {} cases per thread'.format(
                self.ncores, self.ncases))
            print('--- Total number of executed cases is {}*{}={} cases'.format(
                self.ncores, self.ncases, self.ncores * self.ncases))

    if self.ncores > 1:
        with joblib.Parallel(n_jobs=self.ncores) as parallel:
            x_vals, func_vals = zip(*parallel(
                joblib.delayed(self.worker)(core + 1)
                for core in range(self.ncores)))

            # flatten the x-lists for all cores
            x_vals_flatten = []
            for lists in x_vals:
                for item in lists:
                    x_vals_flatten.append(item)

            # flatten the y results from all cores
            func_vals_flatten = [item for sublist in func_vals for item in sublist]

            assert len(func_vals_flatten) == len(x_vals_flatten), \
                '--error: the length of func_vals_flatten and x_vals_flatten in parallel Bayesian search must be equal'

            self.bayesres = pd.DataFrame(x_vals_flatten, columns=self.func_args)
            self.bayesres['score'] = np.array(func_vals_flatten) \
                if self.mode == 'min' else -np.array(func_vals_flatten)
    else:
        if self.mode == 'min':
            @use_named_args(dimensions=self.dimensions)
            def fitness_wrapper(*args, **kwargs):
                return self.fit(*args, **kwargs)
        else:
            @use_named_args(dimensions=self.dimensions)
            def fitness_wrapper(*args, **kwargs):
                return -self.fit(*args, **kwargs)

        # Single core search
        self.search_result = gp_minimize(func=fitness_wrapper,
                                         dimensions=self.dimensions,
                                         acq_func='EI',  # Expected Improvement
                                         n_calls=self.ncases,
                                         random_state=self.seed)
        self.bayesres = pd.DataFrame(self.search_result.x_iters, columns=self.func_args)
        self.bayesres['score'] = self.search_result.func_vals \
            if self.mode == 'min' else -self.search_result.func_vals

    self.bayesres.index += 1

    if self.csvlogger:
        self.bayesres.index.name = 'id'
        self.bayesres.to_csv(self.csvlogger)

    return self.bayesres
def fit(self, X, y, check_input=True): """Fit model with coordinate descent. Parameters ----------- X : ndarray or scipy.sparse matrix, (n_samples, n_features) Data y : ndarray, shape (n_samples,) or (n_samples, n_targets) Target check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Notes ----- Coordinate descent is an algorithm that considers each column of data at a time hence it will automatically convert the X input as a Fortran-contiguous numpy array if necessary. To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ if self.alpha == 0: warnings.warn( "With alpha=0, this algorithm does not converge " "well. You are advised to use the LinearRegression " "estimator", stacklevel=2, ) if isinstance(self.precompute, six.string_types): raise ValueError("precompute should be one of True, False or" " array-like. Got %r" % self.precompute) # We expect X and y to be float64 or float32 Fortran ordered arrays # when bypassing checks if check_input: X, y = check_X_y( X, y, accept_sparse="csc", order="F", dtype=[np.float64, np.float32], copy=self.copy_X and self.fit_intercept, multi_output=True, y_numeric=True, ) y = check_array(y, order="F", copy=False, dtype=X.dtype.type, ensure_2d=False) X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( X, y, None, self.precompute, self.normalize, self.fit_intercept, copy=False) if y.ndim == 1: y = y[:, None] if Xy is not None and Xy.ndim == 1: Xy = Xy[:, None] n_samples, n_features = X.shape n_targets = y.shape[1] if self.selection not in ["cyclic", "random"]: raise ValueError("selection should be either random or cyclic.") if not self.warm_start or self.coef_ is None: coef_ = np.zeros((n_targets, n_features), dtype=X.dtype, order="F") else: coef_ = self.coef_ if coef_.ndim == 1: coef_ = coef_[None, :] dual_gaps_ = np.zeros(n_targets, dtype=X.dtype) if self.n_jobs == 1: self.n_iter_ = [] history = [] for k in range(n_targets): if self.mode == "admm": this_coef, hist, this_iter = group_lasso_overlap( X, y[:, k], lamda=self.alpha, groups=self.groups, rho=self.rho, max_iter=self.max_iter, tol=self.tol, verbose=self.verbose, rtol=self.rtol, ) elif self.mode == "paspal-matlab": this_coef, hist, this_iter = group_lasso_overlap_paspal( X, y[:, k], lamda=self.alpha, groups=self.groups, rho=self.rho, max_iter=self.max_iter, tol=self.tol, verbose=self.verbose, rtol=self.rtol, matlab_engine=self.matlab_engine, ) elif self.mode == "paspal": # paspal wrapper this_coef, hist, this_iter = glopridu_algorithm( X, y[:, k], tau=self.alpha, blocks=self.groups, max_iter_ext=self.max_iter, tol_ext=self.tol, verbose=self.verbose, tol_int=self.rtol, ) else: raise ValueError(self.mode) coef_[k] = this_coef.ravel() history.append(hist) self.n_iter_.append(this_iter) else: import joblib as jl if self.mode == "admm": coef_, history, self.n_iter_ = zip(*jl.Parallel( n_jobs=self.n_jobs)(jl.delayed(group_lasso_overlap)( X, y[:, k], lamda=self.alpha, groups=self.groups, rho=self.rho, max_iter=self.max_iter, tol=self.tol, verbose=self.verbose, rtol=self.rtol, ) for k in range(n_targets))) elif self.mode == "paspal-matlab": # paspal wrapper coef_, history, self.n_iter_ = zip(*jl.Parallel( n_jobs=self.n_jobs)(jl.delayed(group_lasso_overlap_paspal)( X, y[:, k], lamda=self.alpha, groups=self.groups, rho=self.rho, max_iter=self.max_iter, tol=self.tol, verbose=self.verbose, rtol=self.rtol, matlab_engine=self.matlab_engine, ) for k in 
range(n_targets))) elif self.mode == "paspal": # paspal wrapper coef_, history, self.n_iter_ = zip(*jl.Parallel( n_jobs=self.n_jobs)(jl.delayed(glopridu_algorithm)( X, y[:, k], tau=self.alpha, blocks=self.groups, max_iter_ext=self.max_iter, tol_ext=self.tol, verbose=self.verbose, tol_int=self.rtol, ) for k in range(n_targets))) else: raise ValueError(self.mode) if n_targets == 1: self.n_iter_ = self.n_iter_[0] self.coef_, self.dual_gap_ = map(np.squeeze, [coef_, dual_gaps_]) self._set_intercept(X_offset, y_offset, X_scale) # workaround since _set_intercept will cast self.coef_ into float64 self.coef_ = np.asarray(self.coef_, dtype=X.dtype) self.history_ = history # return self for chaining fit and predict calls return self
def get_normalized_lupi_intervals(self, lupi_features, presetModel=None):
    # We define a list of all the features we want to compute relevance bounds for
    X, _ = self.data  # TODO: handle other data formats
    all_d = X.shape[1]
    normal_d = all_d - lupi_features

    # Compute relevance bounds and probes for normal features and LUPI
    with joblib.Parallel(n_jobs=self.n_jobs, verbose=self.verbose) as parallel:
        d_n = _get_necessary_dimensions(normal_d, presetModel)
        rb = self.compute_relevance_bounds(d_n, parallel=parallel)
        probe_upper = self.compute_probe_values(d_n, True, parallel=parallel)
        probe_lower = self.compute_probe_values(d_n, False, parallel=parallel)

        d_l = _get_necessary_dimensions(all_d, presetModel, start=normal_d)
        rb_l = self.compute_relevance_bounds(d_l, parallel=parallel)
        probe_priv_upper = self.compute_probe_values(d_l, True, parallel=parallel)
        probe_priv_lower = self.compute_probe_values(d_l, False, parallel=parallel)

    #
    # Postprocess
    #

    # Get Scaling Parameters
    l1 = self.init_constraints["w_l1"]
    l1_priv = self.init_constraints["w_priv_l1"]
    l1 = l1 + l1_priv

    # Normalize Normal and LUPI features
    rb_norm = self._postprocessing(l1, rb)
    rb_l_norm = self._postprocessing(l1, rb_l)
    interval_ = np.concatenate([rb_norm, rb_l_norm])

    # Normalize Probes
    probe_lower = self._postprocessing(l1, probe_lower)
    probe_upper = self._postprocessing(l1, probe_upper)
    probe_priv_lower = self._postprocessing(l1, probe_priv_lower)
    probe_priv_upper = self._postprocessing(l1, probe_priv_upper)

    #
    # Classify features
    #
    self.f_classifier = FeatureClassifier(probe_lower, probe_upper,
                                          verbose=self.verbose)
    feature_classes = self.f_classifier.classify(rb_norm)

    self.f_classifier_lupi = FeatureClassifier(probe_priv_lower, probe_priv_upper,
                                               verbose=self.verbose)
    feature_classes_lupi = self.f_classifier_lupi.classify(rb_l_norm)

    fc_both = np.concatenate([feature_classes, feature_classes_lupi])

    return interval_, fc_both
def __init__(self, n_jobs=1):
    self.n_jobs = n_jobs
    self.parallel = joblib.Parallel(n_jobs=self.n_jobs)
    self.reset()
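# A minimal sketch of how a stored Parallel instance like the one above can be
# reused across method calls (class and method names here are illustrative, not
# from the original code). Calling the same joblib.Parallel object repeatedly is
# supported; wrapping it in a `with` block additionally keeps the worker pool
# alive between calls.
import joblib


class BatchRunner:
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs
        self.parallel = joblib.Parallel(n_jobs=self.n_jobs)

    def squares(self, values):
        return self.parallel(joblib.delayed(pow)(v, 2) for v in values)

    def cubes(self, values):
        return self.parallel(joblib.delayed(pow)(v, 3) for v in values)

# BatchRunner(n_jobs=2).squares(range(4))  -> [0, 1, 4, 9]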
def parallel(n):
    xy_combos = ((x, y) for x in range(1, n) for y in range(1, n))
    digit_sum = joblib.delayed(get_digit_sum)
    results = joblib.Parallel(joblib.cpu_count() * 2)(
        digit_sum(x, y) for x, y in xy_combos)
    print(max(results))
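# Self-contained sketch of the snippet above with a stand-in for the undefined
# `get_digit_sum` (assumed here to sum the decimal digits of x**y, which is what
# this kind of exercise usually asks for):
import joblib


def get_digit_sum(x, y):
    return sum(int(d) for d in str(x ** y))


def parallel_demo(n):
    xy_combos = ((x, y) for x in range(1, n) for y in range(1, n))
    digit_sum = joblib.delayed(get_digit_sum)
    results = joblib.Parallel(n_jobs=joblib.cpu_count() * 2)(
        digit_sum(x, y) for x, y in xy_combos)
    return max(results)

# parallel_demo(30)  -> largest digit sum found for x, y < 30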
    ('BZar.ttf', 'BZarBold.ttf', 'BZarItalic.ttf'),
    ('CourierNew.ttf', 'CourierNewBold.ttf', 'CourierNewItalic.ttf'),
    ('HelveticaNormal.ttf', 'HelveticaBold.ttf'),
    ('IRANSans.ttf', 'IRANSansBold.ttf', 'IRANSansItalic.ttf'),
    ('NotoNaskhArabic.ttf', 'NotoNaskhArabicBold.ttf', 'NotoNaskhArabicItalic.ttf'),
    ('Tahoma.ttf', 'TahomaBold.ttf'),
    ('TimesNewRoman.ttf', 'TimesNewRomanBold.ttf', 'TimesNewRomanItalic.ttf'),
]
fonts = [['fonts/' + name for name in item] for item in fonts]

random.shuffle(images)
texts, images, fonts = itertools.cycle(texts), itertools.cycle(images), itertools.cycle(fonts)

# create htmls
page_htmls = [
    create_page_html(texts, images, fonts, random.choice(['tabale', 'multi-col']))
    for i in range(5000)
]

joblib.Parallel(n_jobs=4, backend='multiprocessing')([
    joblib.delayed(render)(html,
                           address('resources/generated/{}/{}.png'.format(
                               hashed(html)[:2], hashed(html))))
    for html in page_htmls
])

# print json names
# print([os.path.abspath(filename) for filename in glob('resources/generated/*/*.json')])
def main():
    # Set directory locations for old and new raw/aggregate files
    raw_path = r"C:\Users\npbyers\Desktop\OTB\ChapNumFixes\chap_adjusted_raw_round2"
    manual_path = "./fix_mats/"
    rawfolder = "./chap_adjusted_raw_round2/"
    aggfolder = "./chap_adjusted_agg_round2/"
    rawfolder_new = "./chap_cleaned/raw/"
    aggfolder_new = "./chap_cleaned/agg/"

    # Initialize lists for each file type
    flag_rows_filelist = []
    vol_list = []
    raw_filelist = []

    # Create a list of flag_rows files and a list of volumes for
    # which flag_rows files exist
    for root, dirs, files in os.walk(manual_path):
        for file in files:
            if "flag_rows" in file:
                folder = file.replace("_flag_rows.csv", "")
                flag_rows_filelist.append(manual_path + "/" + folder + "/" + file)
                vol_list.append(folder)

    # Create a list of all old raw files
    all_raw = [f for f in os.listdir(raw_path) if f.endswith(".csv")]

    # Create a list of raw files for those volumes with corresponding flag_rows
    # files. These are the raw files that will be sent to the 'fix_integration'
    # function.
    # If a volume did not undergo any manual fixes (and for which there is thus
    # no corresponding flag_rows file), the existing raw/agg files for that
    # volume are simply copied and pasted to the new "chap_cleaned" destination
    # directory.
    for i in all_raw:
        base = i.replace("_output_chapadjusted_rd2.csv", "")
        raw_outname_new = i.replace("_output_chapadjusted_rd2.csv", "_cleaned.csv")
        agg_outname_new = i.replace("_output_chapadjusted_rd2.csv", "_aggregated_cleaned.csv")
        agg_inname_old = i.replace("_output_chapadjusted_rd2.csv", "_aggregated_chapadjusted_rd2.csv")
        if base in vol_list:
            raw_filelist.append(rawfolder + i)
        else:
            copyfile(rawfolder + i, rawfolder_new + raw_outname_new)
            copyfile(aggfolder + agg_inname_old, aggfolder_new + agg_outname_new)

    # Create a dataframe from the file in which all manual fixes and
    # transcriptions from the previous step (manual review) have been recorded
    fixfile = r"C:\Users\npbyers\Desktop\OTB\ChapNumFixes\fix_mats\Chap_Error_Fixes_for_script.csv"
    fix_df = pd.read_csv(fixfile, encoding='utf-8', low_memory=False)

    # Extract the rows in the fix file that pertain to the volume in question
    # and add them as a new dataframe to a dictionary which also contains
    # the filepath strings for the raw and flag_rows files for that volume
    raw_flag_fix_dicts = []
    for i in range(0, len(raw_filelist)):
        vol_fix_df = fix_df[fix_df['Volume'] == vol_list[i]].copy().reset_index()
        raw_flag_fix_dicts.append({
            'raw': raw_filelist[i],
            'flag_rows': flag_rows_filelist[i],
            'fixes': vol_fix_df
        })

    # Call the 'fix_integration' function using the dictionaries created above,
    # one for each volume with manual fixes to be integrated. This operation
    # is run in parallel to reduce compute time.
    with joblib.parallel_backend(n_jobs=7, backend='loky'):
        joblib.Parallel(verbose=5)(joblib.delayed(fix_integration)(fix_dict)
                                   for fix_dict in raw_flag_fix_dicts)
# Train Linear SVM classifier
print('Training the SVM classifier...')
lin_svm, std_scaler, pca = classification.train_linear_svm(
    vis_words, labels, C=1, dim_reduction=None)
print('Elapsed time: {:.2f} s'.format(time.time() - temp))
temp = time.time()

# Read the test set
test_images_filenames, test_labels = io.load_test_set()
print('Loaded {} test images.'.format(len(test_images_filenames)))

# Feature extraction with sift, prediction with SVM and aggregation to obtain final class
print('Predicting test data...')
test_results = joblib.Parallel(n_jobs=N_JOBS, backend='threading')(
    joblib.delayed(parallel_testing)(test_image, test_label, codebook,
                                     lin_svm, std_scaler, pca)
    for test_image, test_label in zip(test_images_filenames, test_labels))

pred_results = [x[0] for x in test_results]
pred_class = [x[1] for x in test_results]
pred_prob = [x[2] for x in test_results]

num_correct = np.count_nonzero(pred_results)
print('Elapsed time: {:.2f} s'.format(time.time() - temp))
temp = time.time()

# Compute accuracy
accuracy = num_correct * 100.0 / len(test_images_filenames)

# Show results and timing
print('\nACCURACY: {:.2f}'.format(accuracy))
def _compute_efficient(self, bw): """ Computes the bandwidth by estimating the scaling factor (c) in n_res resamples of size ``n_sub`` (in `randomize` case), or by dividing ``nobs`` into as many ``n_sub`` blocks as needed (if `randomize` is False). References ---------- See p.9 in socserv.mcmaster.ca/racine/np_faq.pdf """ if bw is None: self._bw_method = 'normal_reference' if isinstance(bw, string_types): self._bw_method = bw else: self._bw_method = "user-specified" return bw nobs = self.nobs n_sub = self.n_sub data = copy.deepcopy(self.data) n_cvars = self.data_type.count('c') co = 4 # 2*order of continuous kernel do = 4 # 2*order of discrete kernel _, ix_ord, ix_unord = _get_type_pos(self.data_type) # Define bounds for slicing the data if self.randomize: # randomize chooses blocks of size n_sub, independent of nobs bounds = [None] * self.n_res else: bounds = [(i * n_sub, (i + 1) * n_sub) for i in range(nobs // n_sub)] if nobs % n_sub > 0: bounds.append((nobs - nobs % n_sub, nobs)) n_blocks = self.n_res if self.randomize else len(bounds) sample_scale = np.empty((n_blocks, self.k_vars)) only_bw = np.empty((n_blocks, self.k_vars)) class_type, class_vars = self._get_class_vars_type() if has_joblib: # `res` is a list of tuples (sample_scale_sub, bw_sub) res = joblib.Parallel(n_jobs=self.n_jobs) \ (joblib.delayed(_compute_subset) \ (class_type, data, bw, co, do, n_cvars, ix_ord, ix_unord, \ n_sub, class_vars, self.randomize, bounds[i]) \ for i in range(n_blocks)) else: res = [] for i in range(n_blocks): res.append( _compute_subset(class_type, data, bw, co, do, n_cvars, ix_ord, ix_unord, n_sub, class_vars, self.randomize, bounds[i])) for i in range(n_blocks): sample_scale[i, :] = res[i][0] only_bw[i, :] = res[i][1] s = self._compute_dispersion(data) order_func = np.median if self.return_median else np.mean m_scale = order_func(sample_scale, axis=0) # TODO: Check if 1/5 is correct in line below! bw = m_scale * s * nobs**(-1. / (n_cvars + co)) bw[ix_ord] = m_scale[ix_ord] * nobs**(-2. / (n_cvars + do)) bw[ix_unord] = m_scale[ix_unord] * nobs**(-2. / (n_cvars + do)) if self.return_only_bw: bw = np.median(only_bw, axis=0) return bw
def parallel_map(f, items, cpus=PARALLEL_MAP_CPUS):
    return joblib.Parallel(n_jobs=cpus)(joblib.delayed(f)(item) for item in items)
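# Usage sketch for parallel_map above (assumes PARALLEL_MAP_CPUS is defined
# elsewhere in the module, e.g. PARALLEL_MAP_CPUS = joblib.cpu_count()):
#
#     parallel_map(abs, [-3, 1, -2], cpus=2)   # -> [3, 1, 2]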
def start_kernel(self):
    self.DTYPE = self._dtype

    # Get const values
    self.MASS_LEN = len(self)
    self.SIM_DIM = len(self._mass_list[0]._r)

    # Init multiprocessing pool
    self.CPU_LEN = self._threads
    self.cpu_pool = joblib.Parallel(
        n_jobs=self.CPU_LEN,
        prefer='processes'  # alternative: 'threads'
    )

    self.data_pool = []
    for _ in range(self.CPU_LEN):
        self.data_pool.append({
            # Allocate memory: Object parameters
            'mass_r_array': np.zeros((self.MASS_LEN, self.SIM_DIM), dtype=self.DTYPE),
            'mass_a_array': np.zeros((self.MASS_LEN, self.SIM_DIM), dtype=self.DTYPE),
            'mass_m_array': np.zeros((self.MASS_LEN,), dtype=self.DTYPE),
            # Allocate memory: Temporary variables
            'relative_r': np.zeros((self.MASS_LEN - 1, self.SIM_DIM), dtype=self.DTYPE),
            'distance_sq': np.zeros((self.MASS_LEN - 1,), dtype=self.DTYPE),
            'distance_sqv': np.zeros((self.MASS_LEN - 1, self.SIM_DIM), dtype=self.DTYPE),
            'distance_inv': np.zeros((self.MASS_LEN - 1,), dtype=self.DTYPE),
            'a_factor': np.zeros((self.MASS_LEN - 1,), dtype=self.DTYPE),
            'a1': np.zeros((self.MASS_LEN - 1,), dtype=self.DTYPE),
            'a1r': np.zeros((self.MASS_LEN - 1, self.SIM_DIM), dtype=self.DTYPE),
            'a1v': np.zeros((self.SIM_DIM,), dtype=self.DTYPE),
            'a2': np.zeros((self.MASS_LEN - 1,), dtype=self.DTYPE),
            'a2r': np.zeros((self.MASS_LEN - 1, self.SIM_DIM), dtype=self.DTYPE),
            'G': self._G,
        })

    # Copy const data into Numpy infrastructure
    for pm_index, pm in enumerate(self._mass_list):
        self.data_pool[0]['mass_m_array'][pm_index] = pm._m
    for data_set in self.data_pool[1:]:
        data_set['mass_m_array'][:] = self.data_pool[0]['mass_m_array'][:]

    # Compute line index tuples for evenly sized batches
    total_pairs = (self.MASS_LEN * (self.MASS_LEN - 1)) // 2
    batch_length = total_pairs // self.CPU_LEN
    self.index_pool = []
    pair_count = 0
    start_line = 0
    for line in range(1, self.MASS_LEN - 1):
        pair_count += (self.MASS_LEN - 1 - line)
        if pair_count < batch_length:
            continue
        pair_count = 0
        self.index_pool.append((start_line, line))
        start_line = line
    assert len(self.index_pool) in [(self.CPU_LEN - 1), self.CPU_LEN]
    if len(self.index_pool) == self.CPU_LEN - 1:
        self.index_pool.append((start_line, self.MASS_LEN - 1))
    assert self.index_pool[-1][1] == self.MASS_LEN - 1
    vpi_out_arr = np.array(vpi_out_lst)
    vpi_out_arr = 100 * vpi_out_arr
    vci_out_arr = np.array(vci_out_lst)

    # write VCI array to disc
    out_pth_vci = r'Y:\germany-drought\VCI_VPI\\' + tile + r'\\' + out_descr + '_VCI.tif'
    writeRasterInt(vci_out_arr, out_pth_vci, gt, pr, -32767)

    ## Write VPI array to disc
    out_pth_vpi = r'Y:\germany-drought\VCI_VPI\\' + tile + r'\\' + out_descr + '_VPI.tif'
    writeRasterInt(vpi_out_arr, out_pth_vpi, gt, pr, -32767)

    ## Optional: Writing statistic arrays to disc
    # out_pth_min = r'Y:\germany-drought\VCI_VPI\\' + tile + r'\\' + bl + '_NDVI_MIN.tif'
    # writeRasterInt(min_arr, out_pth_min, gt, pr, -32767)
    # out_pth_max = r'Y:\germany-drought\VCI_VPI\\' + tile + r'\\' + bl + '_NDVI_MAX.tif'
    # writeRasterInt(max_arr, out_pth_max, gt, pr, -32767)
    # out_pth_std = r'Y:\germany-drought\VCI_VPI\\' + tile + r'\\' + bl + '_NDVI_STD.tif'
    # writeRasterFloat(out_pth_std, out_pth, gt, pr, -32767)
    # out_pth_avg = r'Y:\germany-drought\VCI_VPI\\' + tile + r'\\' + bl + '_NDVI_AVG.tif'
    # writeRasterFloat(out_pth_avg, out_pth, gt, pr, -32767)

    print('Done: ' + tile)


if __name__ == '__main__':
    joblib.Parallel(n_jobs=40)(joblib.delayed(workFunc)(i) for i in job_lst)

    endtime = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    print("Start time " + str(starttime))
    print("End time " + str(endtime))
import glob
import os

import joblib
from PIL import Image

dst_dir = 'data/temp/joblib/dst_img'
os.makedirs(dst_dir, exist_ok=True)

files = glob.glob('data/temp/joblib/src_img/*')

for f in files:
    try:
        img = Image.open(f)
        img_resize = img.resize((img.width // 2, img.height // 2))
        root, ext = os.path.splitext(f)
        basename = os.path.basename(root)
        img_resize.save(os.path.join(dst_dir, basename + '_half' + ext))
    except OSError as e:
        pass


def func(f):
    try:
        img = Image.open(f)
        img_resize = img.resize((img.width // 2, img.height // 2))
        root, ext = os.path.splitext(f)
        basename = os.path.basename(root)
        img_resize.save(os.path.join(dst_dir, basename + '_half' + ext))
    except OSError as e:
        pass


_ = joblib.Parallel(n_jobs=-1)(joblib.delayed(func)(f) for f in files)
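# Hedged timing sketch for comparing the two variants above (illustrative only;
# the parallel run wins only once per-image work outweighs worker start-up cost):
#
#     import time
#     t0 = time.time(); [func(f) for f in files]; print('sequential', time.time() - t0)
#     t0 = time.time(); joblib.Parallel(n_jobs=-1)(joblib.delayed(func)(f) for f in files); print('parallel', time.time() - t0)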
def cvglmnet(*, x, y, family='gaussian', ptype='default', nfolds=10, foldid=scipy.empty([0]), parallel=1, keep=False, grouped=True, **options): options = glmnetSet(options) if 0 < len(options['lambdau']) < 2: raise ValueError('Need more than one value of lambda for cv.glmnet') nobs = x.shape[0] # we should not really need this. user must supply the right shape # if y.shape[0] != nobs: # y = scipy.transpose(y) # convert 1d python array of size nobs to 2d python array of size nobs x 1 if len(y.shape) == 1: y = scipy.reshape(y, [y.size, 1]) # we should not really need this. user must supply the right shape # if (len(options['offset']) > 0) and (options['offset'].shape[0] != nobs): # options['offset'] = scipy.transpose(options['offset']) if len(options['weights']) == 0: options['weights'] = scipy.ones([nobs, 1], dtype=scipy.float64) # main call to glmnet glmfit = glmnet(x=x, y=y, family=family, **options) is_offset = glmfit['offset'] options['lambdau'] = glmfit['lambdau'] nz = glmnetPredict(glmfit, scipy.empty([0]), scipy.empty([0]), 'nonzero') if glmfit['class'] == 'multnet': nnz = scipy.zeros([len(options['lambdau']), len(nz)]) for i in range(len(nz)): nnz[:, i] = scipy.transpose(scipy.sum(nz[i], axis=0)) nz = scipy.ceil(scipy.median(nnz, axis=1)) elif glmfit['class'] == 'mrelnet': nz = scipy.transpose(scipy.sum(nz[0], axis=0)) else: nz = scipy.transpose(scipy.sum(nz, axis=0)) if len(foldid) == 0: ma = scipy.tile(scipy.arange(nfolds), [1, int(scipy.floor(nobs / nfolds))]) mb = scipy.arange(scipy.mod(nobs, nfolds)) mb = scipy.reshape(mb, [1, mb.size]) population = scipy.append(ma, mb, axis=1) mc = scipy.random.permutation(len(population)) mc = mc[0:nobs] foldid = population[mc] foldid = scipy.reshape(foldid, [ foldid.size, ]) else: nfolds = scipy.amax(foldid) + 1 if nfolds < 3: raise ValueError( 'nfolds must be bigger than 3; nfolds = 10 recommended') cpredmat = list() foldid = scipy.reshape(foldid, [ foldid.size, ]) if parallel != 1: if parallel == -1: num_cores = multiprocessing.cpu_count() else: num_cores = parallel sys.stderr.write("[status]\tParallel glmnet cv with " + str(num_cores) + " cores\n") cpredmat = joblib.Parallel(n_jobs=num_cores)(joblib.delayed(doCV)( i, x, y, family, foldid, nfolds, is_offset, **options) for i in range(nfolds)) else: for i in range(nfolds): newFit = doCV(i, x, y, family, foldid, nfolds, is_offset, **options) cpredmat.append(newFit) if cpredmat[0]['class'] == 'elnet': cvstuff = cvelnet(cpredmat, options['lambdau'], x, y, options['weights'], options['offset'], foldid, ptype, grouped, keep) elif cpredmat[0]['class'] == 'lognet': cvstuff = cvlognet(cpredmat, options['lambdau'], x, y, options['weights'], options['offset'], foldid, nfolds, ptype, grouped, keep) elif cpredmat[0]['class'] == 'multnet': cvstuff = cvmultnet(cpredmat, options['lambdau'], x, y, options['weights'], options['offset'], foldid, ptype, grouped, keep) elif cpredmat[0]['class'] == 'mrelnet': cvstuff = cvmrelnet(cpredmat, options['lambdau'], x, y, options['weights'], options['offset'], foldid, ptype, grouped, keep) elif cpredmat[0]['class'] == 'fishnet': cvstuff = cvfishnet(cpredmat, options['lambdau'], x, y, options['weights'], options['offset'], foldid, ptype, grouped, keep) elif cpredmat[0]['class'] == 'coxnet': raise NotImplementedError( 'Cross-validation for coxnet not implemented yet.') #cvstuff = cvcoxnet(cpredmat, options['lambdau'], x, y \ # , options['weights'], options['offset'] \ # , foldid, ptype, grouped, keep) cvm = cvstuff['cvm'] cvsd = cvstuff['cvsd'] cvname = cvstuff['name'] 
CVerr = dict() CVerr['lambdau'] = options['lambdau'] CVerr['cvm'] = scipy.transpose(cvm) CVerr['cvsd'] = scipy.transpose(cvsd) CVerr['cvup'] = scipy.transpose(cvm + cvsd) CVerr['cvlo'] = scipy.transpose(cvm - cvsd) CVerr['nzero'] = nz CVerr['name'] = cvname CVerr['glmnet_fit'] = glmfit if keep: CVerr['fit_preval'] = cvstuff['fit_preval'] CVerr['foldid'] = foldid if ptype == 'auc': cvm = -cvm aa = options['lambdau'][cvm <= scipy.amin(cvm)] if len(aa) > 0: CVerr['lambda_min'] = scipy.amax(aa).reshape([1]) idmin = options['lambdau'] == CVerr['lambda_min'] semin = cvm[idmin] + cvsd[idmin] CVerr['lambda_1se'] = scipy.amax( options['lambdau'][cvm <= semin]).reshape([1]) else: CVerr['lambda_min'] = [1000] CVerr['lambda_1se'] = [1000] CVerr['class'] = 'cvglmnet' return CVerr
def _maybe_parallelize_psi( event_annotation, reads2d, isoform1_junctions, isoform2_junctions, min_reads=MIN_READS, method='mean', uneven_coverage_multiplier=UNEVEN_COVERAGE_MULTIPLIER, n_jobs=-1): """If n_jobs!=1, run the parallelized version of psi Parameters ---------- event_annotation : pandas.DataFrame A table of all possible events, with event ids as the index (row names) and all junctions described, and contains the columns described by ``isoform1_junctions`` and ``isoform_junctions`` reads2d : pandas.DataFrame A (n_samples, n_total_junctions) table of the number of reads found in all samples' exon-exon, all junctions. Very very large, e.g. 1000 samples x 50,000 junctions = 50 million elements isoform1_junctions : list of str Junction numbers corresponding to isoform 1, e.g. ['junction13'] isoform2_junctions : list of str Junction numbers corresponding to isoform 2, e.g. ['junction12', 'junction23'] min_reads : int, optional Minimum number of reads for a junction to be viable. The rules governing compatibility of events are complex, and it is recommended to read the documentation for ``outrigger psi`` (default=10) method : "mean" | "min", optional Denotes the method by which to aggregate junctions from the same isoform - either use the mean (default) or the minimum. (default="mean") uneven_coverage_multiplier : int, optional Scale factor for the maximum amount bigger one side of a junction can be before rejecting the event, e.g. for an SE event with two junctions, junction12 and junction23, junction12=40 but junction23=500, then this event would be rejected because 500 > 40*10 (default=10) n_jobs : int, optional Number of subprocesses to create. Default is -1, which is to use as many processes/cores as possible Returns ------- summary : pandas.DataFrame A (n_samples * n_events, 7) shaped table with the sample id, junction reads, percent spliced-in (Psi), and notes on each event in each sample, that explains why or why not Psi was calculated """ # There are multiple rows with the same event id because the junctions # are the same, but the flanking exons may be a little wider or shorter, # but ultimately the event Psi is calculated only on the junctions so the # flanking exons don't matter for this. But, all the exons are in # exon\d.bed in the index! And you, the lovely user, can decide what you # want to do with them! grouped = event_annotation.groupby(level=0, axis=0) n_events = len(grouped.size()) if n_jobs == 1: # Do a separate branch because joblib doesn't do a good job of # managing the python debugger so use --n-jobs=1 (n_jobs=1) when # debugging progress('\tIterating over {} events ...\n'.format(n_events)) summaries = [] for event_id, event_df in grouped: summary = _single_event_psi( event_id, event_df, reads2d, isoform1_junctions, isoform2_junctions, min_reads=min_reads, uneven_coverage_multiplier=uneven_coverage_multiplier, method=method) summaries.append(summary) else: processors = n_jobs if n_jobs > 0 else joblib.cpu_count() progress("\tParallelizing {} events' Psi calculation across {} " "CPUs ...\n".format(n_events, processors)) summaries = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_single_event_psi)( event_id, event_df, reads2d, isoform1_junctions, isoform2_junctions, min_reads=min_reads, uneven_coverage_multiplier=uneven_coverage_multiplier, method=method) for event_id, event_df in grouped) return summaries
def train(self, data, parallel=True):
    """
    Train CCA with cross-validation for a set of regularization
    coefficients and/or numbers of CCs

    Attributes:
        data (list): training data matrices
            (number of samples X number of features).
            Number of samples must match across datasets.
        parallel (bool): use joblib to train cross-validation folds
            in parallel
    """
    nT = data[0].shape[0]
    chunklen = 10 if nT > 50 else 1
    nchunks = int(0.2 * nT / chunklen)
    indchunks = zip(*[iter(range(nT))] * chunklen)
    corr_mat = np.zeros((len(self.regs), len(self.numCCs)))
    selection = max(int(self.select * min([d.shape[1] for d in data])), 1)
    for ri, reg in enumerate(self.regs):
        for ci, numCC in enumerate(self.numCCs):
            running_corr_mean_sum = 0.
            if parallel:
                fold_corr_means = joblib.Parallel(n_jobs=self.numCV)(
                    joblib.delayed(train_cvfold)(data=data,
                                                 reg=reg,
                                                 numCC=numCC,
                                                 kernelcca=self.kernelcca,
                                                 ktype=self.ktype,
                                                 gausigma=self.gausigma,
                                                 degree=self.degree,
                                                 cutoff=self.cutoff,
                                                 selection=selection)
                    for fold in range(self.numCV))
                running_corr_mean_sum += sum(fold_corr_means)
            else:
                for cvfold in range(self.numCV):
                    fold_corr_mean = train_cvfold(data=data,
                                                  reg=reg,
                                                  numCC=numCC,
                                                  kernelcca=self.kernelcca,
                                                  ktype=self.ktype,
                                                  gausigma=self.gausigma,
                                                  degree=self.degree,
                                                  cutoff=self.cutoff,
                                                  selection=selection)
                    running_corr_mean_sum += fold_corr_mean

            corr_mat[ri, ci] = running_corr_mean_sum / self.numCV

    best_ri, best_ci = np.where(corr_mat == corr_mat.max())
    self.best_reg = self.regs[best_ri[0]]
    self.best_numCC = self.numCCs[best_ci[0]]

    comps = kcca(data, self.best_reg, self.best_numCC,
                 kernelcca=self.kernelcca, ktype=self.ktype,
                 gausigma=self.gausigma, degree=self.degree)
    self.cancorrs, self.ws, self.comps = recon(data, comps,
                                               kernelcca=self.kernelcca)
    if len(data) == 2:
        self.cancorrs = self.cancorrs[np.nonzero(self.cancorrs)]
    return self
fns.sort()
for fn in fns[1:]:
    ncname = os.path.split(fn)[1]
    print "loading...", fn
    ncf = NCFile(fn, cache_s21_raw=True)
    print "accessing sweep..."
    swa = ncf.SweepArray0
    print "extracting data..."
    data = []
    for k in range(32):
        swp = swa.sweep(k)
        data.append((swp.frequency, swp.s21_point, swp.s21_point_error))
    print "starting parallel jobs..."
    pp = joblib.Parallel(n_jobs=16, verbose=5)
    results = pp([joblib.delayed(process_sweep)(*args) for args in data])
    results = list(itertools.chain.from_iterable(results))
    print "saving results..."
    print joblib.dump(results, ('/home/gjones/%s_resonators.pkl' % ncname), compress=True)
    print "plotting..."
    fig, ax = plt.subplots()
    for k in range(32):
        swp = swa.sweep(k)
        ax.plot(swp.frequency, 20 * np.log10(np.abs(swp.s21_point)))
    for params in results:
        f0 = params['f_0'].value
        if f0 > swp.frequency.min() and f0 < swp.frequency.max():
def permuted_ols(tested_vars, target_vars, confounding_vars=None, model_intercept=True, n_perm=10000, two_sided_test=True, random_state=None, n_jobs=1, verbose=0): """Massively univariate group analysis with permuted OLS. Tested variates are independently fitted to target variates descriptors (e.g. brain imaging signal) according to a linear model solved with an Ordinary Least Squares criterion. Confounding variates may be included in the model. Permutation testing is used to assess the significance of the relationship between the tested variates and the target variates [1]_, [2]_. A max-type procedure is used to obtain family-wise corrected p-values. The specific permutation scheme implemented here is the one of [3]_. Its has been demonstrated in [1]_ that this scheme conveys more sensitivity than alternative schemes. This holds for neuroimaging applications, as discussed in details in [2]_. Permutations are performed on parallel computing units. Each of them performs a fraction of permutations on the whole dataset. Thus, the max t-score amongst data descriptors can be computed directly, which avoids storing all the computed t-scores. The variates should be given C-contiguous. target_vars are fortran-ordered automatically to speed-up computations. Parameters ---------- tested_vars : array-like, shape=(n_samples, n_regressors) Explanatory variates, fitted and tested independently from each others. target_vars : array-like, shape=(n_samples, n_descriptors) fMRI data, trying to be explained by explanatory and confounding variates. confounding_vars : array-like, shape=(n_samples, n_covars), optional Confounding variates (covariates), fitted but not tested. If None, no confounding variate is added to the model (except maybe a constant column according to the value of `model_intercept`) model_intercept : bool, optional If True, a constant column is added to the confounding variates unless the tested variate is already the intercept. Default=True n_perm : int, optional Number of permutations to perform. Permutations are costly but the more are performed, the more precision one gets in the p-values estimation. Default=10000. two_sided_test : boolean, optional If True, performs an unsigned t-test. Both positive and negative effects are considered; the null hypothesis is that the effect is zero. If False, only positive effects are considered as relevant. The null hypothesis is that the effect is zero or negative. Default=True. random_state : int or None, optional Seed for random number generator, to have the same permutations in each computing units. n_jobs : int, optional Number of parallel workers. If 0 is provided, all CPUs are used. A negative number indicates that all the CPUs except (abs(n_jobs) - 1) ones will be used. Default=1. verbose : int, optional verbosity level (0 means no message). Default=0. Returns ------- pvals : array-like, shape=(n_regressors, n_descriptors) Negative log10 p-values associated with the significance test of the n_regressors explanatory variates against the n_descriptors target variates. Family-wise corrected p-values. score_orig_data : numpy.ndarray, shape=(n_regressors, n_descriptors) t-statistic associated with the significance test of the n_regressors explanatory variates against the n_descriptors target variates. The ranks of the scores into the h0 distribution correspond to the p-values. h0_fmax : array-like, shape=(n_perm, ) Distribution of the (max) t-statistic under the null hypothesis (obtained from the permutations). Array is sorted. 
References ---------- .. [1] Anderson, M. J. & Robinson, J. (2001). Permutation tests for linear models. Australian & New Zealand Journal of Statistics, 43(1), 75-88. .. [2] Winkler, A. M. et al. (2014). Permutation inference for the general linear model. Neuroimage. .. [3] Freedman, D. & Lane, D. (1983). A nonstochastic interpretation of reported significance levels. J. Bus. Econ. Stats., 1(4), 292-298 """ # initialize the seed of the random generator rng = check_random_state(random_state) # check n_jobs (number of CPUs) if n_jobs == 0: # invalid according to joblib's conventions raise ValueError("'n_jobs == 0' is not a valid choice. " "Please provide a positive number of CPUs, or -1 " "for all CPUs, or a negative number (-i) for " "'all but (i-1)' CPUs (joblib conventions).") elif n_jobs < 0: n_jobs = max(1, joblib.cpu_count() - int(n_jobs) + 1) else: n_jobs = min(n_jobs, joblib.cpu_count()) # make target_vars F-ordered to speed-up computation if target_vars.ndim != 2: raise ValueError( "'target_vars' should be a 2D array. " "An array with %d dimension%s was passed" % (target_vars.ndim, "s" if target_vars.ndim > 1 else "")) target_vars = np.asfortranarray(target_vars) # efficient for chunking n_descriptors = target_vars.shape[1] if np.any(np.all(target_vars == 0, axis=0)): warnings.warn( "Some descriptors in 'target_vars' have zeros across all " "samples. These descriptors will be ignored during null " "distribution generation.") # check explanatory variates dimensions if tested_vars.ndim == 1: tested_vars = np.atleast_2d(tested_vars).T n_samples, n_regressors = tested_vars.shape # check if explanatory variates is intercept (constant) or not if (n_regressors == 1 and np.unique(tested_vars).size == 1): intercept_test = True else: intercept_test = False # optionally add intercept if model_intercept and not intercept_test: if confounding_vars is not None: confounding_vars = np.hstack( (confounding_vars, np.ones((n_samples, 1)))) else: confounding_vars = np.ones((n_samples, 1)) ### OLS regression on original data if confounding_vars is not None: # step 1: extract effect of covars from target vars covars_orthonormalized = orthonormalize_matrix(confounding_vars) if not covars_orthonormalized.flags['C_CONTIGUOUS']: # useful to developer warnings.warn('Confounding variates not C_CONTIGUOUS.') covars_orthonormalized = np.ascontiguousarray( covars_orthonormalized) targetvars_normalized = normalize_matrix_on_axis( target_vars).T # faster with F-ordered target_vars_chunk if not targetvars_normalized.flags['C_CONTIGUOUS']: # useful to developer warnings.warn('Target variates not C_CONTIGUOUS.') targetvars_normalized = np.ascontiguousarray(targetvars_normalized) beta_targetvars_covars = np.dot(targetvars_normalized, covars_orthonormalized) targetvars_resid_covars = targetvars_normalized - np.dot( beta_targetvars_covars, covars_orthonormalized.T) targetvars_resid_covars = normalize_matrix_on_axis( targetvars_resid_covars, axis=1) # step 2: extract effect of covars from tested vars testedvars_normalized = normalize_matrix_on_axis(tested_vars.T, axis=1) beta_testedvars_covars = np.dot(testedvars_normalized, covars_orthonormalized) testedvars_resid_covars = testedvars_normalized - np.dot( beta_testedvars_covars, covars_orthonormalized.T) testedvars_resid_covars = normalize_matrix_on_axis( testedvars_resid_covars, axis=1).T.copy() else: targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T testedvars_resid_covars = normalize_matrix_on_axis(tested_vars).copy() covars_orthonormalized = None # 
check arrays contiguousity (for the sake of code efficiency) if not targetvars_resid_covars.flags['C_CONTIGUOUS']: # useful to developer warnings.warn('Target variates not C_CONTIGUOUS.') targetvars_resid_covars = np.ascontiguousarray(targetvars_resid_covars) if not testedvars_resid_covars.flags['C_CONTIGUOUS']: # useful to developer warnings.warn('Tested variates not C_CONTIGUOUS.') testedvars_resid_covars = np.ascontiguousarray(testedvars_resid_covars) # step 3: original regression (= regression on residuals + adjust t-score) # compute t score for original data scores_original_data = _t_score_with_covars_and_normalized_design( testedvars_resid_covars, targetvars_resid_covars.T, covars_orthonormalized) if two_sided_test: sign_scores_original_data = np.sign(scores_original_data) scores_original_data = np.fabs(scores_original_data) ### Permutations # parallel computing units perform a reduced number of permutations each if n_perm > n_jobs: n_perm_chunks = np.asarray([n_perm / n_jobs] * n_jobs, dtype=int) n_perm_chunks[-1] += n_perm % n_jobs elif n_perm > 0: warnings.warn('The specified number of permutations is %d and ' 'the number of jobs to be performed in parallel has ' 'set to %s. This is incompatible so only %d jobs will ' 'be running. You may want to perform more permutations ' 'in order to take the most of the available computing ' 'ressources.' % (n_perm, n_jobs, n_perm)) n_perm_chunks = np.ones(n_perm, dtype=int) else: # 0 or negative number of permutations => original data scores only if two_sided_test: scores_original_data = (scores_original_data * sign_scores_original_data) return np.asarray([]), scores_original_data.T, np.asarray([]) # actual permutations, seeded from a random integer between 0 and maximum # value represented by np.int32 (to have a large entropy). ret = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)( joblib.delayed(_permuted_ols_on_chunk)(scores_original_data, testedvars_resid_covars, targetvars_resid_covars.T, thread_id + 1, covars_orthonormalized, n_perm=n_perm, n_perm_chunk=n_perm_chunk, intercept_test=intercept_test, two_sided_test=two_sided_test, random_state=rng.randint( 1, np.iinfo(np.int32).max - 1), verbose=verbose) for thread_id, n_perm_chunk in enumerate(n_perm_chunks)) # reduce results scores_as_ranks_parts, h0_fmax_parts = zip(*ret) h0_fmax = np.hstack((h0_fmax_parts)) scores_as_ranks = np.zeros((n_regressors, n_descriptors)) for scores_as_ranks_part in scores_as_ranks_parts: scores_as_ranks += scores_as_ranks_part # convert ranks into p-values pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm) # put back sign on scores if it was removed in the case of a two-sided test # (useful to distinguish between positive and negative effects) if two_sided_test: scores_original_data = scores_original_data * sign_scores_original_data return -np.log10(pvals), scores_original_data.T, h0_fmax[0]
def _run_algos():
    name_config = ""
    for config_name, config in configs_getter.get_configs():
        name_config = config_name
    fpath = os.path.join(os.path.dirname(__file__), "../../output/metrics_draft",
                         f'{name_config}.csv')
    tmp_dirpath = f'{fpath}.tmp_results'
    os.makedirs(tmp_dirpath, exist_ok=True)
    atexit.register(shutil.rmtree, tmp_dirpath)
    tmp_files_idx = 0
    delayed_jobs = []

    nb_stocks_flag = [int(nb) for nb in FLAGS.nb_stocks or []]
    for config_name, config in configs_getter.get_configs():
        print(f'Config {config_name}', config)
        config.algos = [a for a in config.algos
                        if FLAGS.algos is None or a in FLAGS.algos]
        if nb_stocks_flag:
            config.nb_stocks = [a for a in config.nb_stocks if a in nb_stocks_flag]
        combinations = list(
            itertools.product(
                config.algos, config.dividends, config.maturities,
                config.nb_dates, config.nb_paths, config.nb_stocks,
                config.payoffs, config.drift, config.spots, config.stock_models,
                config.strikes, config.volatilities, config.mean, config.speed,
                config.correlation, config.hurst, config.nb_epochs,
                config.hidden_size, config.factors, config.ridge_coeff,
                config.train_ITM_only, config.use_path))
        # random.shuffle(combinations)
        for params in combinations:
            for i in range(config.nb_runs):
                tmp_file_path = os.path.join(tmp_dirpath, str(tmp_files_idx))
                tmp_files_idx += 1
                delayed_jobs.append(
                    joblib.delayed(_run_algo)(tmp_file_path, *params,
                                              fail_on_error=FLAGS.print_errors))

    print(f"Running {len(delayed_jobs)} tasks using "
          f"{FLAGS.nb_jobs}/{NUM_PROCESSORS} CPUs...")
    joblib.Parallel(n_jobs=FLAGS.nb_jobs)(delayed_jobs)

    print(f'Writing results to {fpath}...')
    with open(fpath, "w") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_CSV_HEADERS)
        writer.writeheader()
        for idx in range(tmp_files_idx):
            tmp_file_path = os.path.join(tmp_dirpath, str(idx))
            try:
                with open(tmp_file_path, "r") as read_f:
                    csvfile.write(read_f.read())
            except FileNotFoundError:
                pass

    return fpath
Species_To_Store = np.array([False, False, False, False, True, True])

BatchNum = 400  # Number of replicate datasets

import joblib as jb
from functools import partial
ntasks = 40

for i in range(len(File_list)):
    input_file = File_list[i]

    # compute moments for BatchNum replicate datasets
    #Moms_time_data = []
    #for n in range(BatchNum):
    #    data = Load_moms_time(input_file, Moments, keep_species=Species_To_Store)
    #    Moms_time_data.append(data)

    Load_moms_time_p = partial(Load_moms_time,
                               input_filename=input_file,
                               Moments=Moments,
                               keep_species=Species_To_Store)

    Moms_time_data = jb.Parallel(n_jobs=ntasks)(jb.delayed(Load_moms_time_p)()
                                                for n in range(BatchNum))

    # perform GRN inference for each dataset in the Batch
    Batch_Inference(
        Moms_time_data,
        [DLab_m[i] + "(#%d)" % n for n in range(BatchNum)],
        DLab_m[i],
        shift=30,
        sub_sample=15,
        PDF_Save_dir='/nfs/datanumerik/people/araharin/Data_032021/PDF',
        GRN_Save_dir='/nfs/datanumerik/people/araharin/Data_032021/GRNs',
        indexes=indexes)
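# Hedged side note: functools.partial is one way to freeze the keyword
# arguments; joblib.delayed also accepts them directly, so an equivalent
# (untested-here) formulation of the call above would be:
#
#     Moms_time_data = jb.Parallel(n_jobs=ntasks)(
#         jb.delayed(Load_moms_time)(input_filename=input_file,
#                                    Moments=Moments,
#                                    keep_species=Species_To_Store)
#         for n in range(BatchNum))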
def resize(self, input_path, size=(1024, 768), output_dirpath=None, output_prefix='resized_image', n_jobs=-1): def __resize(img_path, size, output_dirpath, output_prefix): print(img_path) # open the original image img_original = skimage.io.imread(img_path)[:, :, :3] h, w, c = img_original.shape resize_ratio = None if max(h, w) == h: resize_ratio = size[1] / h else: resize_ratio = size[0] / w # resize the original image to target size img = skimage.transform.rescale(img_original, resize_ratio, mode='reflect', anti_aliasing=True, multichannel=True) # make background img_original_2x = skimage.transform.rescale(img_original, 1.5, mode='reflect', anti_aliasing=True, multichannel=True) bg_img = self.__augmentation_generate_background(img_original_2x) bh, bw, bc = bg_img.shape block_size = img.shape[ 0] if img.shape[0] > img.shape[1] else img.shape[1] x0 = int((bg_img.shape[0] - block_size) / 2) x1 = x0 + block_size y0 = int((bg_img.shape[1] - block_size) / 2) y1 = y0 + block_size bg_img = bg_img[x0:x1, y0:y1] # synthesis img = self.__get_padding(img) mask = skimage.color.rgb2gray(img) mask = np.pad( skimage.transform.resize( mask, (mask.shape[0] - 2, mask.shape[1] - 2), mode='constant'), 1, self.__zero_padding) img[mask < 0.001] = bg_img[mask < 0.001] # print out resized image if output_dirpath is None: output_dirpath = '' new_file_path = os.path.join( output_dirpath, output_prefix + '_' + os.path.basename(img_path)) skimage.io.imsave(new_file_path, img) # print out xml, if given xml_path = os.path.splitext(img_path)[0] + '.xml' new_file_path = os.path.join( output_dirpath, output_prefix + '_' + os.path.basename(xml_path)) print(new_file_path) if xml_path is not None: # shift if max(h, w) == h: shift_h = 0 shift_w = (h - w) * resize_ratio / 2 else: shift_w = 0 shift_h = (w - h) * resize_ratio / 2 re_width = re.compile(r'<width>([0-9]+)</width>') re_height = re.compile(r'<height>([0-9]+)</height>') re_xmin = re.compile(r'<xmin>([0-9]+)</xmin>') re_xmax = re.compile(r'<xmax>([0-9]+)</xmax>') re_ymin = re.compile(r'<ymin>([0-9]+)</ymin>') re_ymax = re.compile(r'<ymax>([0-9]+)</ymax>') with open(xml_path, 'r') as inxml, open(new_file_path, 'w') as outxml: for buf in inxml: v = None if '<width>' in buf: v = re_width.search(buf).group(1) buf = buf.replace(v, str(img.shape[1])) elif '<height>' in buf: v = re_height.search(buf).group(1) buf = buf.replace(v, str(img.shape[0])) elif 'xmin' in buf: v = re_xmin.search(buf).group(1) buf = buf.replace( v, str(int(int(v) * resize_ratio + shift_w))) elif 'xmax' in buf: v = re_xmax.search(buf).group(1) buf = buf.replace( v, str(int(int(v) * resize_ratio + shift_w))) elif 'ymin' in buf: v = re_ymin.search(buf).group(1) buf = buf.replace( v, str(int(int(v) * resize_ratio + shift_h))) elif 'ymax' in buf: v = re_ymax.search(buf).group(1) buf = buf.replace( v, str(int(int(v) * resize_ratio + shift_h))) outxml.write(buf) image_files = [] if os.path.isfile(input_path): image_files = [input_path] elif os.path.isdir(input_path): print(input_path) for f in os.listdir(input_path): if os.path.splitext(f)[1] in self.image_extension: image_files.append(os.path.join(input_path, f)) r = joblib.Parallel(n_jobs=n_jobs, verbose=0)([ joblib.delayed(__resize)(image_file, size, output_dirpath, output_prefix) for image_file in image_files ])
def Parallel(**kwargs: Any) -> Any:  # TODO: disable lint complaint
    """Adapter for joblib.Parallel so we could, if desired, centralize control"""
    # ATM just a straight invocation
    import joblib
    return joblib.Parallel(**kwargs)
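# Usage sketch for the adapter above (the call is identical to using
# joblib.Parallel directly; only the construction is routed through the wrapper):
#
#     from math import sqrt
#     import joblib
#     res = Parallel(n_jobs=2)(joblib.delayed(sqrt)(i) for i in range(4))
#     # res == [0.0, 1.0, 1.4142135623730951, 1.7320508075688772]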
def tune(self, ncores=1, csvname=None, verbose=True):
    """
    This function starts the tuning process with a specified number of processors

    :param ncores: (int) number of parallel processors (see the **Notes** section below for an important note about parallel execution)
    :param csvname: (str) the name of the csv file name to save the tuning results
        (useful for expensive cases as the csv file is updated directly after the case is done)
    :param verbose: (bool) whether to print updates to the screen or not
    """
    self.ncores = ncores
    self.csvlogger = csvname
    self.verbose = verbose

    if self.verbose:
        print('***************************************************************')
        print('****************Grid Search is Running*************************')
        print('***************************************************************')

        if self.ncores > 1:
            print('--- Running in parallel with {} cores'.format(self.ncores))

    if self.csvlogger:
        headers = ['id'] + self.param_names + ['score']
        with open(self.csvlogger, 'w') as csvfile:
            csvwriter = csv.writer(csvfile,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   lineterminator='\n')
            csvwriter.writerow(headers)

    core_lst = []
    for i in range(len(self.hyperparameter_cases)):
        core_lst.append([i + 1, self.hyperparameter_cases[i]])

    if self.ncores > 1:
        #p=Pool(self.ncores)
        #results = p.map(self.worker, core_lst)
        #p.close()
        #p.join()
        with joblib.Parallel(n_jobs=self.ncores) as parallel:
            results = parallel(joblib.delayed(self.worker)(item)
                               for item in core_lst)
    else:
        results = []
        for item in core_lst:
            results.append(self.worker(item))

    gridres = pd.DataFrame(self.hyperparameter_cases, columns=self.param_names)
    gridres.index += 1
    gridres['score'] = results
    #gridres = gridres.sort_values(['score'], axis='index', ascending=False)

    return gridres