def test_dont_assume_function_purity(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                              scheduler_host=('127.0.0.1', s['port'])):
            x, y = Parallel()(delayed(random2)() for i in range(2))
            assert x != y
def test_simple(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                              scheduler_host=('127.0.0.1', s['port'])):
            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]

            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]
def test_dont_assume_function_purity(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                              scheduler_host=('127.0.0.1', s['port'])) as p:
            x, y = Parallel()(delayed(random2)() for i in range(2))
            assert x != y

            from joblib.parallel import get_active_backend
            ba, _ = get_active_backend()
            ba.executor.shutdown()
def test_simple(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                              scheduler_host=('127.0.0.1', s['port'])) as p:
            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]

            with pytest.raises(ValueError):
                Parallel()(delayed(slow_raise_value_error)(i == 3)
                           for i in range(10))

            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]

            from joblib.parallel import get_active_backend
            ba, _ = get_active_backend()
            ba.executor.shutdown()
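# The tests above rely on helpers (inc, random2, slow_raise_value_error) that
# are not defined in this snippet. A minimal sketch of what they could look
# like, assuming behaviour inferred from how the tests use them:
import random
import time


def inc(x):
    # trivial deterministic task
    return x + 1


def random2():
    # non-deterministic task, used to check that results are not cached/reused
    return random.random()


def slow_raise_value_error(condition, duration=0.05):
    # sleeps briefly, then raises only for the selected input
    time.sleep(duration)
    if condition:
        raise ValueError("condition evaluated to True")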
def run_simulation(delta_reg, previous_params): config_vmc_file = import_config(sys.argv[1]) config_vmc_import = config_vmc_file.MC_parameters(int(sys.argv[2]), int(sys.argv[3]), rank) config_vmc = cv_module.MC_parameters(int(sys.argv[2]), int(sys.argv[3]), rank) config_vmc.__dict__ = config_vmc_import.__dict__.copy() print_model_summary(config_vmc) if previous_params is not None: config_vmc.initial_parameters = previous_params config_vmc.workdir = config_vmc.workdir + '/irrep_{:d}_Ne_{:d}/'.format( rank, int(sys.argv[3])) os.makedirs(config_vmc.workdir, exist_ok=True) with open(os.path.join(config_vmc.workdir, 'config.py'), 'w') as target, \ open(sys.argv[1], 'r') as source: # save config file to workdir (to remember!!) target.write(source.read()) if config_vmc.visualisation: # config_vmc.twist = [np.exp(2.0j * np.pi * 0.1904), np.exp(2.0j * np.pi * (0.1904 + 0.1))] visualisation.plot_levels_evolution_mu(config_vmc) visualisation.plot_all_waves(config_vmc) visualisation.plot_DOS(config_vmc) visualisation.plot_fermi_surface(config_vmc) visualisation.plot_all_waves(config_vmc) visualisation.plot_all_Jastrow(config_vmc) visualisation.plot_MF_spectrum_profile(config_vmc) config_vmc.twist = [ 1, 1 ] #[np.exp(2.0j * np.pi * 0.1904), np.exp(2.0j * np.pi * (0.1904 + 0.10))] if config_vmc.tests: if rank == 0: if tests.perform_all_tests(config_vmc): print('\033[92m All tests passed successfully \033[0m', flush=True) else: print('\033[91m Warning: some of the tests failed! \033[0m', flush=True) comm.Barrier() n_cpus_max = psutil.cpu_count(logical=True) print('max available CPUs:', n_cpus_max) n_cpus = config_vmc.n_cpus if config_vmc.n_cpus == -1: n_cpus = n_cpus_max print('performing simulation at', n_cpus, 'CPUs') ### generate twists once and for all (Sandro's suggestion) ### if config_vmc.twist_mesh == 'Baldereschi': print('Working with the Baldereschi mesh') if config_vmc.n_sublattices == 2: twists = [[ np.exp(2.0j * np.pi * 0.1904), np.exp(2.0j * np.pi * (0.1904 + 0.1)) ] for _ in range(config_vmc.n_chains)] if config_vmc.n_sublattices == 1: twists = [[1., -1.] for _ in range(config_vmc.n_chains)] # FIXME twists_per_cpu = config_vmc.n_chains / n_cpus elif config_vmc.twist_mesh == 'PBC': twists = [[1., 1.] for _ in range(config_vmc.n_chains)] twists_per_cpu = config_vmc.n_chains / n_cpus elif config_vmc.twist_mesh == 'APBCy': twists = [[1., -1.] for _ in range(config_vmc.n_chains)] twists_per_cpu = config_vmc.n_chains / n_cpus elif config_vmc.twist_mesh == 'reals': assert config_vmc.n_chains == 4 twists = [[1, 1], [1, -1], [-1, 1], [-1, -1]] twists_per_cpu = config_vmc.n_chains / n_cpus assert twists_per_cpu == 1 elif config_vmc.twist_mesh == 'uniform': L = config_vmc.L_twists_uniform twists = [] for i_x in range(L): for i_y in range(L): twists.append([ np.exp(1.0j * np.pi * (-1. + 1. / L + 2. * i_x / L)), np.exp(1.0j * np.pi * (-1. + 1. / L + 2. * i_y / L)) ]) twists_per_cpu = len(twists) // n_cpus if twists_per_cpu * n_cpus < len(twists): twists_per_cpu += 1 else: print('Twist {:s} is not supported'.format(config_vmc.twist_mesh)) exit(-1) print(twists) print( 'Number of twists: {:d}, number of chains {:d}, twists per cpu {:2f}'. 
format(len(twists), config_vmc.n_chains, twists_per_cpu)) K_matrices_up = [ models.apply_TBC(config_vmc, twist, deepcopy(config_vmc.K_0), inverse=False) for twist in twists ] print(repr(K_matrices_up[0])) print(config_vmc.K_0) #exit(-1) K_matrices_down = [ models.apply_TBC(config_vmc, twist, deepcopy(config_vmc.K_0).T, inverse=True) for twist in twists ] reg_terms = [models.apply_TBC(config_vmc, twist, deepcopy(config_vmc.reg_gap_term), inverse = False) * \ config_vmc.reg_gap_val for twist in twists] config_vmc.MC_chain = config_vmc.MC_chain // len( twists) # the MC_chain contains the total required number of samples pairings_list = config_vmc.pairings_list pairings_names = config_vmc.pairings_list_names # template = 'e_{:.2f}_Ne_{:d}'.format(config_vmc.epsilon, config_vmc.Ne) if config_vmc.PN_projection else \ # 'e_{:.2f}_mu_{:.2f}'.format(config_vmc.epsilon, config_vmc.mu) local_workdir = config_vmc.workdir obs_files = [] loaded_from_external = False if config_vmc.load_parameters: if config_vmc.load_parameters_path is not None: loaded_from_external = True filename = config_vmc.load_parameters_path parameters, last_step = load_parameters(filename) elif os.path.isfile(os.path.join(local_workdir, 'last_opt_params.p')): filename = os.path.join(local_workdir, 'last_opt_params.p') parameters, last_step = load_parameters(filename) else: parameters = config_vmc.initial_parameters last_step = 0 else: parameters = config_vmc.initial_parameters last_step = 0 if config_vmc.condensation_energy_check_regime: parameters[config_vmc.layout[:3].sum():config_vmc.layout[:4].sum( )] = 0. log_file = open(os.path.join(local_workdir, 'general_log.dat'), 'a+') force_file = open(os.path.join(local_workdir, 'force_log.dat'), 'a+') gaps_file = open(os.path.join(local_workdir, 'gaps_log.dat'), 'a+') force_SR_file = open(os.path.join(local_workdir, 'force_SR_log.dat'), 'a+') spectral_file = open(os.path.join(local_workdir, 'spectral_log.dat'), 'a+') final_states = [False] * len(twists) orbitals_in_use = [None] * len(twists) ### write log header only if we start from some random parameters ### if last_step == 0 or loaded_from_external: write_initial_logs(log_file, force_file, force_SR_file, config_vmc) #for n_step in range(last_step, config_vmc.optimisation_steps): n_step = last_step while n_step < config_vmc.optimisation_steps: t = time() if twists_per_cpu > 1: with parallel_backend("loky", inner_max_num_threads=1): results_batched = Parallel(n_jobs=n_cpus)(delayed(get_MC_chain_result)( \ n_step, \ deepcopy(config_vmc), \ pairings_list, \ parameters, \ twists = twists[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \ final_states = final_states[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \ orbitals_in_use = orbitals_in_use[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \ K_matrices_up = K_matrices_up[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \ K_matrices_down = K_matrices_down[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \ regs = reg_terms[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \ ) for i in range(n_cpus)) #for i in range(n_cpus): # print(i * twists_per_cpu, np.min([(i + 1) * twists_per_cpu, len(twists)])) results = [] for i, r in enumerate(results_batched): results = results + r print('obtained {:d} results from {:d} cpu'.format( len(r), i)) print('obtained in total {:d} results'.format(len(results))) #print(len(twists)) else: with parallel_backend("loky", 
inner_max_num_threads=1): results = Parallel(n_jobs=config_vmc.n_chains)(delayed(_get_MC_chain_result)( \ n_step, \ deepcopy(config_vmc), \ pairings_list, \ parameters, \ twists[i], \ final_states[i], \ orbitals_in_use[i], \ K_matrices_up[i], \ K_matrices_down[i], \ reg_terms[i], \ ) for i in range(config_vmc.n_chains)) print('MC chain generationof {:d} no {:d} took {:f}'.format( rank, n_step, time() - t)) t = time() ### print-out current energy levels ### E = results[0][7] spectral_file.write( str(n_step) + ' ' + ("{:.7f} " * len(E) + '\n').format(*E)) spectral_file.flush() ### MC chains data extraction ### gaps, gap, energies, mean_variance, Os, acceptance, \ final_states, densities, orbitals_in_use, occupied_numbers = \ extract_MC_data(results, config_vmc, config_vmc.n_chains) energies_merged = np.concatenate(energies) n_above_FS = len( np.setdiff1d(occupied_numbers[0], np.arange(config_vmc.total_dof // 2))) ### gradient step ### if config_vmc.generator_mode: # evolve parameters only if it's necessary mask = np.ones(np.sum(config_vmc.layout)) factor_stages = 1 if n_step < 100: # jastrows and mu_BCS have not converged yet mask = np.zeros(np.sum(config_vmc.layout)) mask[-config_vmc.layout[4]:] = 1. factor_stages = 30. # # mask[:config_vmc.layout[0]] = 1. # mask[config_vmc.layout[0] + config_vmc.layout[1] + config_vmc.layout[2]:config_vmc.layout[0] + \ # config_vmc.layout[1] + config_vmc.layout[2] + config_vmc.layout[3]] = 0. #mask[1] = 0.0 # fugacity is not optimized in the meantime # Os = [np.einsum('ik,k->ik', Os_theta, config_vmc.mask) for Os_theta in Os] step, forces = make_SR_step(Os, energies, config_vmc, twists, gaps, n_step, mask) write_intermediate_log(log_file, force_file, force_SR_file, n_step, config_vmc.total_dof // 2, energies, densities, \ mean_variance, acceptance, forces, step, gap, n_above_FS, parameters) # write parameters before step not to lose the initial values write_gaps_log(gaps_file, gaps, n_step) #if np.abs(gap) < 1e-4: # if the gap is too small, SR will make gradient just 0 # step = forces #step = forces * config_vmc.opt_parameters[1] step = step * config_vmc.opt_parameters[1] #step = clip_forces(config_vmc.all_clips, step) parameters += step * mask * factor_stages # lr better be ~0.01..0.1 #if parameters[0] < config_vmc.mu_BCS_min: # parameters[0] = config_vmc.mu_BCS_min #if parameters[0] > config_vmc.mu_BCS_max: # parameters[0] = config_vmc.mu_BCS_max save_parameters(parameters, config_vmc.workdir, n_step) ### END SR STEP ### observables = np.concatenate([np.array(x[4]) for x in results], axis=0) observables_names = results[0][5] n_step += 1 if len(observables_names) == 0: continue if n_step == config_vmc.thermalization + 1: create_obs_files(observables_names, config_vmc) write_observables(n_step, obs_files, observables, config_vmc) if rank == 0: print('SR and logging {:d} took {:f}'.format(n_step, time() - t)) log_file.close() force_file.close() force_SR_file.close() spectral_file.close() [file.close() for file in obs_files] return parameters
from pyitab.analysis.searchlight import SearchLight
from sklearn.model_selection import *
from pyitab.utils import load_test_dataset

import joblib
from dask.distributed import Client
from dask_kubernetes import KubeCluster

cluster = KubeCluster.from_yaml('pods.yml')
pods = cluster.scale(6)
client = Client(cluster.scheduler_address)

ds = load_test_dataset()
cv = KFold()

with joblib.parallel_backend('dask', scatter=[ds]):
    scores = SearchLight(cv=cv).fit(ds)
def handle_call(func, args, kwargs, proxy_positions=[]):
    if len(proxy_positions) > 0:
        args, kwargs = replace_with_values(args, kwargs, proxy_positions)

    with parallel_backend('sequential'):
        return func(*args, **kwargs)
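# Hypothetical usage sketch (the argument values below are illustrative and
# not part of the original snippet): with no proxy_positions given,
# replace_with_values is skipped and the call runs directly under the
# 'sequential' joblib backend.
result = handle_call(sum, ([1, 2, 3],), {})
assert result == 6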
c = Client(cluster)

# Query the client for all connected workers
workers = c.has_what().keys()
n_workers = len(workers)

df = cudf.read_csv(os.path.join(data_dir, "train.csv"))

N_TRIALS = 5

# Drop non-numerical data and fill NaNs before passing to cuML RF
CAT_COLS = list(df.select_dtypes('object').columns)
df = df.drop(CAT_COLS, axis=1)
df = df.dropna()
df = df.astype("float32")
X, y = df.drop(["target"], axis=1), df["target"].astype('int32')

study_name = "dask_optuna_lr_log_loss_tpe"
storage_name = "sqlite:///study_stores.db"

storage = dask_optuna.DaskStorage(storage_name)
study = optuna.create_study(sampler=optuna.samplers.TPESampler(),
                            study_name=study_name,
                            direction="minimize",
                            storage=storage)

# Optimize in parallel on your Dask cluster
with parallel_backend("dask"):
    study.optimize(lambda trial: objective(trial, X, y),
                   n_trials=N_TRIALS,
                   n_jobs=n_workers)

print('Best params {} and best score {}'.format(study.best_params,
                                                study.best_value))
def main(): unsupervised_models = ['OCSVM', 'IF'] is_supervised = args.model not in unsupervised_models # outliers = args.outliers / 100 outliers = 0.05 in_class = 1.0 - outliers pipeline = Pipeline([ ('std', None), ('clf', None), ]) if args.model == 'SVC': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [SVC()], 'clf__kernel': ['rbf', 'poly'], 'clf__C': [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 'clf__gamma': ['scale'], }, ] elif args.model == 'LSVC': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [LinearSVC()], 'clf__C': [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 'clf__max_iter': [100000], }, ] elif args.model == 'LR': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [LogisticRegression()], }, ] elif args.model == 'RF': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [RandomForestClassifier()], 'clf__n_estimators': [ 500, ], 'clf__max_depth': [None, 2, 8, 16], 'clf__min_samples_split': [2, 0.1, 0.5], 'clf__max_features': ['sqrt', 'log2'], }, ] elif args.model == 'GB': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [GradientBoostingClassifier()], 'clf__loss': ['deviance', 'exponential'], 'clf__learning_rate': [0.5, 0.1, 0.01, 0.001], 'clf__n_estimators': [32, 100, 200, 500], 'clf__max_depth': [2, 4, 8, 16], 'clf__min_samples_split': [2, 0.1, 0.5], }, ] elif args.model == 'IF': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [IsolationForest()], 'clf__n_estimators': [20, 50, 100, 200], 'clf__contamination': [outliers, outliers + 0.025, outliers - 0.025], 'clf__max_samples': ['auto', 0.1], 'clf__bootstrap': [True, False], 'clf__behaviour': [True], }, ] elif args.model == 'OCSVM': params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'clf': [OneClassSVM()], 'clf__kernel': ['rbf', 'poly', 'linear'], 'clf__gamma': ['scale', 'auto'], 'clf__nu': [outliers / 2, outliers, outliers * 2], }, ] run_dir_name = args.run_dir name_prefix = '' name_prefix += 'ablation_re_' if args.only_re else '' name_prefix += 'ablation_ln_' if args.only_ln else '' assert args.only_re or args.only_ln chosen_ind = 0 if args.only_re else 1 dataset_names = ['train'] for adv_type in adv_types: dataset_names.append(f'clean_{adv_type}') dataset_names.append(f'adv_{adv_type}') dataset_names.append(f'noisy_{adv_type}') # TODO results dict, aggregate and print mean and stddev to a new file results = {} for run_n in range(args.runs): results[run_n] = {} run_dir = Path(f'{run_dir_name}_{run_n}') run_name = str(run_dir) assert run_dir.exists() datasets = {} for name in dataset_names: dataset_path = run_dir / f'ae_encoded_{name}.npy' if dataset_path.exists(): loaded = np.load(str(dataset_path)) print(f'loaded.shape: {loaded.shape}') # TODO filtered = loaded[:, chosen_ind::2] print(f'filtered.shape: {filtered.shape}') datasets[name] = filtered else: print(f'{dataset_path} is missing!') # for supervised we consider two setups - "known attack" (left half of table 3 from "A Simple Unified Framework...") # and "unknown attack" where we train only on FGSM and validate on the rest # for unsupervised we train on entire training data(! 
- change if needed) and test on clean/adv/noisy if is_supervised: # "known" part results_filename = f'{run_name}/{name_prefix}results_{args.model}_known.txt' with open(results_filename, 'w') as results_file: for adv_type in adv_types: model_filename = f'{run_name}/{name_prefix}final_cv_{args.model}_known_{adv_type}.joblib' train_split = 0.1 train_size = int(train_split * len(datasets[f'clean_{adv_type}'])) test_size = len(datasets[f'clean_{adv_type}']) - train_size X = np.concatenate([ datasets[f'clean_{adv_type}'][:train_size], datasets[f'adv_{adv_type}'][:train_size], datasets[f'noisy_{adv_type}'][:train_size], ]) y = np.concatenate([ np.ones(train_size), np.zeros(train_size), np.ones(train_size), ]) X_test = np.concatenate([ datasets[f'clean_{adv_type}'][train_size:], datasets[f'adv_{adv_type}'][train_size:], datasets[f'noisy_{adv_type}'][train_size:], ]) y_test = np.concatenate([ np.ones(test_size), np.zeros(test_size), np.ones(test_size), ]) if not Path(model_filename).exists(): # train with parallel_backend('loky', n_jobs=args.jobs): gs = GridSearchCV(pipeline, params, scoring=make_scorer( roc_auc_score, needs_threshold=True), cv=StratifiedKFold(5), verbose=1) gs.fit(X, y) # save model joblib.dump(gs, model_filename) else: gs = joblib.load(model_filename) print(f'Best params on {adv_type}: {gs.best_params_}', file=results_file) # print feature importance on Random Forest if args.model == 'RF': rf = gs.best_estimator_['clf'] print( f'RF feature importance for {adv_type}: \n {rf.feature_importances_.tolist()}', file=results_file) # validate y_pred = gs.predict(X_test) try: y_scores = gs.decision_function(X_test) except: y_scores = gs.predict_proba(X_test) if y_scores.ndim > 1: y_scores = y_scores[:, 1] acc = accuracy_score(y_test, y_pred) auroc = roc_auc_score(y_test, y_scores) print(f'Accuracy on {adv_type}: {acc}', file=results_file) results[run_n][f'acc_known_{adv_type}'] = acc print(f'AUROC on {adv_type}: {auroc}', file=results_file) results[run_n][f'auroc_known_{adv_type}'] = auroc # "unknown/FGSM" part results_filename = f'{run_name}/{name_prefix}results_{args.model}_unknown.txt' with open(results_filename, 'w') as results_file: model_filename = f'{run_name}/{name_prefix}final_cv_{args.model}_unknown.joblib' # train on FGSM train_split = 0.1 train_size = int(train_split * len(datasets[f'clean_{adv_types[0]}'])) test_size = len(datasets[f'clean_{adv_types[0]}']) - train_size X = np.concatenate([ datasets[f'clean_{adv_types[0]}'][:train_size], datasets[f'adv_{adv_types[0]}'][:train_size], datasets[f'noisy_{adv_types[0]}'][:train_size], ]) y = np.concatenate([ np.ones(train_size), np.zeros(train_size), np.ones(train_size), ]) X_test = np.concatenate([ datasets[f'clean_{adv_types[0]}'][train_size:], datasets[f'adv_{adv_types[0]}'][train_size:], datasets[f'noisy_{adv_types[0]}'][train_size:], ]) y_test = np.concatenate([ np.ones(test_size), np.zeros(test_size), np.ones(test_size), ]) if not Path(model_filename).exists(): # train with parallel_backend('loky', n_jobs=args.jobs): gs = GridSearchCV(pipeline, params, scoring=make_scorer( roc_auc_score, needs_threshold=True), cv=StratifiedKFold(5), verbose=1) gs.fit(X, y) # save model joblib.dump(gs, model_filename) else: gs = joblib.load(model_filename) print(f'Best params: {gs.best_params_}', file=results_file) # print feature importance on Random Forest if args.model == 'RF': rf = gs.best_estimator_['clf'] print( f'RF feature importance: \n {rf.feature_importances_.tolist()}', file=results_file) # test y_pred = gs.predict(X_test) try: 
y_scores = gs.decision_function(X_test) except: y_scores = gs.predict_proba(X_test) if y_scores.ndim > 1: y_scores = y_scores[:, 1] acc = accuracy_score(y_test, y_pred) auroc = roc_auc_score(y_test, y_scores) print(f'Accuracy on {adv_types[0]}: {acc}', file=results_file) results[run_n][f'acc_unknown_{adv_types[0]}'] = acc print(f'AUROC on {adv_types[0]}: {auroc}', file=results_file) results[run_n][f'auroc_unknown_{adv_types[0]}'] = auroc # and test on the rest for adv_type in adv_types[1:]: test_size = len(datasets[f'clean_{adv_type}']) X_test = np.concatenate([ datasets[f'clean_{adv_type}'], datasets[f'adv_{adv_type}'], datasets[f'noisy_{adv_type}'], ]) y_test = np.concatenate([ np.ones(test_size), np.zeros(test_size), np.ones(test_size), ]) # validate y_pred = gs.predict(X_test) try: y_scores = gs.decision_function(X_test) except: y_scores = gs.predict_proba(X_test) if y_scores.ndim > 1: y_scores = y_scores[:, 1] acc = accuracy_score(y_test, y_pred) auroc = roc_auc_score(y_test, y_scores) print(f'Accuracy on {adv_type}: {acc}', file=results_file) results[run_n][f'acc_unknown_{adv_type}'] = acc print(f'AUROC on {adv_type}: {auroc}', file=results_file) results[run_n][f'auroc_unknown_{adv_type}'] = auroc else: model_filename = f'{run_name}/{name_prefix}final_cv_{args.model}.joblib' results_filename = f'{run_name}/{name_prefix}results_{args.model}.txt' if not Path(model_filename).exists(): # use only train dataset for one-class classifiers X = datasets[f'train'] train_size = len(X) y = np.ones(train_size) with parallel_backend('loky', n_jobs=args.jobs): gs = GridSearchCV(pipeline, params, scoring=make_scorer( score_func, greater_is_better=False), cv=5, verbose=1) gs.fit(X, y) # save model joblib.dump(gs, model_filename) else: gs = joblib.load(model_filename) # evaluate for adv_type in adv_types: test_size = len(datasets[f'clean_{adv_type}']) X_test = np.concatenate([ datasets[f'clean_{adv_type}'], datasets[f'adv_{adv_type}'], datasets[f'noisy_{adv_type}'], ]) y_test = np.concatenate([ np.ones(test_size), np.zeros(test_size), np.ones(test_size), ]) y_pred = gs.predict(X_test) try: y_scores = gs.decision_function(X_test) except: y_scores = gs.predict_proba(X_test)[0] acc = accuracy_score(y_test, y_pred) auroc = roc_auc_score(y_test, y_scores) results[run_n][f'acc_{adv_type}'] = acc results[run_n][f'auroc_{adv_type}'] = auroc # save results with open(results_filename, 'w') as results_file: print(f'Best score: {gs.best_score_}', file=results_file) print(f'Best params: {gs.best_params_}', file=results_file) for adv_type in adv_types: print( f"Accuracy on {adv_type}: {results[run_n][f'acc_{adv_type}']}", file=results_file) print( f"AUROC on {adv_type}: {results[run_n][f'auroc_{adv_type}']}", file=results_file) results_filename = f'{name_prefix}{run_dir_name}_{args.model}.txt' with open(results_filename, 'w') as results_file: for adv_type in adv_types: if is_supervised: # known res = np.array([ results[i][f'acc_known_{adv_type}'] for i in range(args.runs) ]) print( f'Acc on {adv_type}(known): {res.mean()} +/- {res.std(ddof=1)}', file=results_file) res = np.array([ results[i][f'auroc_known_{adv_type}'] for i in range(args.runs) ]) print( f'AUROC on {adv_type}(known): {res.mean()} +/- {res.std(ddof=1)}', file=results_file) # unknown res = np.array([ results[i][f'acc_unknown_{adv_type}'] for i in range(args.runs) ]) print( f'Acc on {adv_type}(unknown): {res.mean()} +/- {res.std(ddof=1)}', file=results_file) res = np.array([ results[i][f'auroc_unknown_{adv_type}'] for i in range(args.runs) ]) print( 
f'AUROC on {adv_type}(unknown): {res.mean()} +/- {res.std(ddof=1)}', file=results_file) else: res = np.array( [results[i][f'acc_{adv_type}'] for i in range(args.runs)]) print(f'Acc on {adv_type}: {res.mean()} +/- {res.std(ddof=1)}', file=results_file) res = np.array([ results[i][f'auroc_{adv_type}'] for i in range(args.runs) ]) print( f'AUROC on {adv_type}: {res.mean()} +/- {res.std(ddof=1)}', file=results_file)
import os
os.environ['SKLEARN_SITE_JOBLIB'] = "1"

from dask.distributed import Client
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
import joblib

client = Client(processes=False)
joblib.parallel_backend('dask')

diabetes = datasets.load_diabetes()
X = diabetes.data[:50]
y = diabetes.target[:50]
model = linear_model.LinearRegression()
cv_results = cross_validate(model, X, y, cv=10,
                            return_train_score=False, verbose=100)

##################################################################

from dask_kubernetes import KubeCluster
from dask.distributed import Client
import os
os.environ['SKLEARN_SITE_JOBLIB'] = "1"
from dask.distributed import Client
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
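# Note on the first snippet above: it instantiates joblib.parallel_backend('dask')
# without a `with` block. A minimal sketch of the documented, scoped form,
# reusing the client, model, X and y defined above, would be:
#
#     with joblib.parallel_backend('dask'):
#         cross_validate(model, X, y, cv=10, return_train_score=False)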
def main(): print(args) pipeline = Pipeline([ ("std", None), ("dim_red", None), ("clf", None), ]) n_components_gs = [3, 5, 15] if args.model == "SVC": SVM = SVC params = [ { "std": [StandardScaler()] if args.pca else [MinMaxScaler(), StandardScaler(), None], "dim_red": [PCA()] if args.pca else [None], "clf": [SVM()], "clf__kernel": ["rbf", "poly"], "clf__C": [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], "clf__gamma": ["scale"], }, ] if args.pca: params[0]["dim_red__n_components"] = n_components_gs elif args.model == "LSVC": LinearSVM = LinearSVC params = [ { "std": [MinMaxScaler(), StandardScaler(), None], "dim_red": [PCA()] if args.pca else [None], "clf": [LinearSVM()], "clf__C": [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], "clf__max_iter": [100000], }, ] if args.pca: params[0]["dim_red__n_components"] = n_components_gs elif args.model == "LR": LR = LogisticRegression params = [ { "std": [MinMaxScaler(), StandardScaler(), None], "dim_red": [PCA()] if args.pca else [None], "clf": [LR()], }, ] if args.pca: params[0]["dim_red__n_components"] = n_components_gs elif args.model == 'LASSO': lasso = SGDClassifier loss = '' params = [ { 'std': [MinMaxScaler(), StandardScaler(), None], 'dim_red': [PCA()] if args.pca else [None], 'clf': [lasso()], 'clf__loss': ['squared_loss'], 'clf__penalty': ['l1'], 'clf__alpha': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], }, ] if args.pca: params[0]['dim_red__n_components'] = n_components_gs elif args.model == "RF": RF = RandomForestClassifier params = [ { "std": [MinMaxScaler(), StandardScaler(), None], "dim_red": [PCA()] if args.pca else [None], "clf": [RF()], "clf__n_estimators": [ 500, ], "clf__max_depth": [None, 2, 8, 16], "clf__min_samples_split": [2, 0.1, 0.5], "clf__max_features": ["sqrt", "log2"], }, ] if args.pca: params[0]["dim_red__n_components"] = n_components_gs elif args.model == "GB": GB = GradientBoostingClassifier loss_list = ["deviance", "exponential"] params = [ { "std": [MinMaxScaler(), StandardScaler(), None], "dim_red": [PCA()] if args.pca else [None], "clf": [GB()], "clf__loss": loss_list, "clf__learning_rate": [0.1, 0.01, 0.001], "clf__n_estimators": [32, 100, 500], "clf__max_depth": [2, 8, 16], "clf__min_samples_split": [2, 0.1, 0.5], }, ] if args.pca: params[0]["dim_red__n_components"] = n_components_gs X, y = read_static_data(args.data, args.labels, args.include, skip_control=not args.with_control) print(f"X shape: {X.shape} y shape: {y.shape}") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y) print(f'trainset size: {X_train.shape[0]}') results_dir = args.results_dir results_dir.mkdir(parents=True, exist_ok=True) included = set(args.include) included = ("all" if included == {"FC", "REHO", "ALFF", "fALFF"} else "_".join(sorted(included))) filename_infix = f"{included}{'_pca' if args.pca else ''}_{args.model}" model_path = results_dir / f"model_{filename_infix}.joblib" results_path = results_dir / f"results_{filename_infix}.txt" importances_path = results_dir / f"importances_{filename_infix}.csv" roc_fpr_path = results_dir / f"roc_fpr_{filename_infix}.csv" roc_tpr_path = results_dir / f"roc_tpr_{filename_infix}.csv" roc_thr_path = results_dir / f"roc_thr_{filename_infix}.csv" with parallel_backend("loky", n_jobs=args.jobs), open(results_path, "w") as results_file: folding = StratifiedKFold(5) scoring = make_scorer(roc_auc_score, needs_threshold=True) if not model_path.exists(): gs = GridSearchCV(pipeline, params, scoring=scoring, cv=folding, verbose=1) gs.fit(X_train, y_train) # save model joblib.dump(gs, 
model_path.resolve()) else: gs = joblib.load(model_path.resolve()) print(f"Best params: {gs.best_params_}", file=results_file) # print variance explained for PCA if args.pca: pca = gs.best_estimator_["dim_red"] print( f"PCA variance explained: \n {pca.explained_variance_ratio_.tolist()}", file=results_file, ) # print feature importance on Random Forest if args.model == "RF": rf = gs.best_estimator_["clf"] importances = rf.feature_importances_ if args.pca: importances = pca.inverse_transform(importances) importances = np.absolute(importances) importances /= np.sum(importances) np.savetxt(importances_path, [importances], delimiter=',') elif args.model == "LR" or args.model == "LASSO": model = gs.best_estimator_["clf"] importances = model.coef_ if args.pca: importances = pca.inverse_transform(importances) importances = np.absolute(importances) importances /= np.sum(importances) np.savetxt(importances_path, importances, delimiter=',') # validate y_test_pred = gs.predict(X_test) if hasattr(gs, 'decision_function'): y_scores = gs.decision_function(X_test) elif hasattr(gs, 'predict_proba'): y_scores = gs.predict_proba(X_test) if y_scores.ndim > 1: y_scores = y_scores[:, 1] else: y_scores = None acc = accuracy_score(y_test, y_test_pred) tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() specificity = tn / (tn + fp) precision = tp / (tp + fp) print(f"Accuracy: {acc}", file=results_file) print(f"Specificity: {specificity}", file=results_file) print(f"Precision: {precision}", file=results_file) if y_scores is not None: auroc = roc_auc_score(y_test, y_scores) print(f"AUROC: {auroc}", file=results_file) fprs, tprs, thrhlds = roc_curve(y_test, y_scores) np.savetxt(roc_fpr_path, fprs, delimiter=',') np.savetxt(roc_tpr_path, tprs, delimiter=',') np.savetxt(roc_thr_path, thrhlds, delimiter=',') # validation score print(f"Validation AUROC: {gs.best_score_}", file=results_file) # test on trainset y_train_pred = gs.predict(X_train) if hasattr(gs, 'decision_function'): y_train_scores = gs.decision_function(X_train) elif hasattr(gs, 'predict_proba'): y_train_scores = gs.predict_proba(X_train) if y_train_scores.ndim > 1: y_train_scores = y_train_scores[:, 1] else: y_train_scores = None train_acc = accuracy_score(y_train, y_train_pred) print(f"Train accuracy: {train_acc}", file=results_file) if y_train_scores is not None: train_auroc = roc_auc_score(y_train, y_train_scores) print(f"Train AUROC: {train_auroc}", file=results_file) # calculate p-values best_estimator = gs.best_estimator_ pvalue, permutation_scores = p_value_permute(best_estimator, auroc, scoring, X_train, y_train, X_test, y_test) print(f"Test p-value: {pvalue}", file=results_file) # print(f"Test permutation scores: {permutation_scores}", file=results_file) if args.shell: IPython.embed()
def dask_clusterMethod(self, cluster_method, mname, data):
    try:
        logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(cluster_method)))
        # delattr(cluster_method, 'behaviour')
        # del cluster_method.__dict__['behaviour']
        for k, v in cluster_method.get_params().items():
            logger.info('[{}] : [INFO] Method parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                k, v))
        try:
            with joblib.parallel_backend('dask'):
                logger.info('[{}] : [INFO] Using Dask backend for user defined method'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                type(inst), inst.args))
            logger.warning('[{}] : [WARN] using default process based backend for user defined method'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf = cluster_method.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(cluster_method), type(inst), inst.args))
        sys.exit(1)
    predictions = clf.predict(data)
    if list(np.unique(predictions)) == [0, 1]:
        anomaly_marker = 1
        normal_marker = 0
    else:
        anomaly_marker = -1
        normal_marker = 1
    logger.info('[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        list(predictions).count(anomaly_marker), len(list(predictions))))
    logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        predictions))
    fname = str(clf).split('(')[0]
    self.__serializemodel(clf, fname, mname)
    self.__plot_feature_sep(data, predictions, method=fname, mname=mname,
                            anomaly_label=anomaly_marker,
                            normal_label=normal_marker)
    self.__decision_boundary(clf, data, method=fname, mname=mname,
                             anomaly_label=anomaly_marker)
    return clf
def dask_isolationForest(self, settings, mname, data):
    '''
    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :param scaler: -> scaler to use on data
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                           max_features:1.0, n_jobs:1, random_state:None, verbose:0}
    '''
    if not settings or settings is None:
        logger.warning('[{}] : [WARN] No IsolationForest parameters defined using defaults'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        # print(settings)
        settings = {}
    else:
        for k, v in settings.items():
            logger.info('[{}] : [INFO] IsolationForest parameter {} set to {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                k, v))
    try:
        clf = IsolationForest(**settings)
        # print(clf)
    except Exception as inst:
        logger.error('[{}] : [INFO] Failed to instantiate IsolationForest with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(inst), inst.args))
        sys.exit(1)
    try:
        with joblib.parallel_backend('dask'):
            logger.info('[{}] : [INFO] Using Dask backend for IsolationForest'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            clf.fit(data)
    except Exception as inst:
        logger.error('[{}] : [ERROR] Failed to fit IsolationForest with {} and {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            type(inst), inst.args))
        sys.exit(1)
    predict = clf.predict(data)
    anoOnly = np.argwhere(predict == -1)
    logger.info('[{}] : [INFO] Found {} anomalies in training dataset of shape {}.'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        len(anoOnly), data.shape))
    logger.info('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        predict))
    self.__serializemodel(clf, 'isoforest', mname)
    self.__appendPredictions(method='isoforest', mname=mname, data=data,
                             pred=predict)
fit_data = namedtuple(
    "FitData",
    (
        "endog_data",
        "exog_data",
        "master_mask",
        "filled_datasets",
        "masked_datasets",
        "land_mask",
    ),
)

n_jobs = 5
with parallel_backend("loky", n_jobs=n_jobs,
                      inner_max_num_threads=math.floor(get_ncpus() / n_jobs)):
    outputs = thres_fire_season_stats(0.1)

fire_season_mask = [out for out in outputs if out[0] == ba_dataset][0][4]

shift_months = [1, 3, 6, 12, 24]

selection_variables = (
    "VOD Ku-band -3 Month",
    "SIF",
    "VOD Ku-band -1 Month",
    "Dry Day Period -3 Month",
    "FAPAR",
    "pftHerb",
    "LAI -1 Month",
def training_loop(self) -> None:
    register_ray()

    self.estimator.set_params(**self.params)

    datasets = self._get_datasets()
    X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)
    groups = None
    if "cv_groups" in X_train.columns:
        groups = X_train["cv_groups"]
        X_train = X_train.drop("cv_groups", axis=1)

    scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
        self.scaling_config
    )

    num_workers = scaling_config_dataclass.num_workers or 0
    assert num_workers == 0  # num_workers is not in scaling config allowed_keys

    trainer_resources = scaling_config_dataclass.trainer_resources or {"CPU": 1}
    has_gpus = bool(trainer_resources.get("GPU", 0))
    num_cpus = int(trainer_resources.get("CPU", 1))

    # see https://scikit-learn.org/stable/computing/parallelism.html
    os.environ["OMP_NUM_THREADS"] = str(num_cpus)
    os.environ["MKL_NUM_THREADS"] = str(num_cpus)
    os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
    os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

    parallelize_cv = self._get_cv_parallelism(has_gpus)
    if self.set_estimator_cpus:
        num_estimator_cpus = 1 if parallelize_cv else num_cpus
        _set_cpu_params(self.estimator, num_estimator_cpus)

    with parallel_backend("ray", n_jobs=num_cpus):
        start_time = time()
        self.estimator.fit(X_train, y_train, **self.fit_params)
        fit_time = time() - start_time

        with tune.checkpoint_dir(step=1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                cpickle.dump(self.estimator, f)

            if self.preprocessor:
                save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

        if self.label_column:
            validation_set_scores = self._score_on_validation_sets(
                self.estimator, datasets
            )
            cv_scores = self._score_cv(
                self.estimator,
                X_train,
                y_train,
                groups,
                # if estimator has parallelism, use that. Otherwise,
                # parallelize CV
                n_jobs=1 if not parallelize_cv else num_cpus,
            )
        else:
            validation_set_scores = {}
            cv_scores = {}

        # cv_scores will not override validation_set_scores as we
        # check for that during initialization
        results = {
            **validation_set_scores,
            **cv_scores,
            "fit_time": fit_time,
        }

        tune.report(**results)
def train(self, df_model_train, model_name, df_model_valid=None, weight=None):
    model_q, evals_result_q = {}, {}

    if 'mean' in self.regression_params['type']:
        num_rounds, early_stopping = self.determine_num_rounds(
            df_model_train, model_name, objective='mean', weight=weight)

        # Train model for mean
        model, evals_result = self.create_fit_model(
            model_name,
            df_model_train,
            objective='mean',
            df_model_valid=df_model_valid,
            weight=weight,
            num_rounds=num_rounds,
            early_stopping=early_stopping)
        model_q['mean'] = model
        evals_result_q['mean'] = evals_result

    if 'quantile' in self.regression_params['type']:
        num_rounds, early_stopping = self.determine_num_rounds(
            df_model_train, model_name, objective='quantile', weight=weight)

        # Train models for different quantiles
        with joblib.parallel_backend(self.parallel_processing['backend']):
            results = joblib.Parallel(
                n_jobs=self.parallel_processing['n_workers'])(
                    joblib.delayed(self.create_fit_model)(
                        model_name,
                        df_model_train,
                        objective='quantile',
                        alpha=alpha,
                        df_model_valid=df_model_valid,
                        weight=weight,
                        num_rounds=num_rounds,
                        early_stopping=early_stopping)
                    for alpha in self.alpha_q)
        for (model, evals_result), alpha in zip(results, self.alpha_q):
            model_q['quantile{0:.2f}'.format(alpha)] = model
            evals_result_q['quantile{0:.2f}'.format(alpha)] = evals_result

    if not (('mean' in self.regression_params['type']) or
            ('quantile' in self.regression_params['type'])):
        raise ValueError(
            'Value of regression parameter "objective" not recognized.')

    # Convert evals_result_q to dataframe
    data = {(level1_key, level2_key): pd.Series(values)
            for level1_key in evals_result_q.keys()
            for level2_key, values in evals_result_q[level1_key].items()}
    df_evals_result_q = pd.DataFrame(data)
    df_evals_result_q.index.name = 'iterations'

    return model_q, df_evals_result_q
#---#

def f(i):
    return i * 10

#---#

trials = 100

#---#

print(trials)
with parallel_backend('loky', n_jobs=2):
    lst = Parallel()(delayed(f)(i) for i in range(trials))
#chk>
print(lst)

#---#

lst2 = [2 * i for i in range(trials)]

#---#

print(lst2)
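#---#

# Quick sanity check of the parallel result above (a minimal sketch): f maps
# each input i to i * 10, so the loky-backed run should match a plain list
# comprehension.
assert lst == [i * 10 for i in range(trials)]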
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-n_jobs", type=int, required=True)
    parser.add_argument("-num_summaries", type=int, required=True)
    parser.add_argument("-in_dir", type=str, required=True)
    parser.add_argument("-out_dir", type=str, required=True)
    args = parser.parse_args()

    assert os.path.isdir(args.in_dir)
    assert os.path.isdir(args.out_dir)

    # Read summary files in parallel
    input_fnames = listdir_fullpath(args.in_dir)
    with parallel_backend('multiprocessing', n_jobs=args.n_jobs):
        all_summaries = Parallel()(delayed(read_out_file)(idx, fname)
                                   for idx, fname in enumerate(input_fnames))

    # sort summaries according to document number
    all_summaries = sorted(all_summaries, key=lambda x: x[0])
    all_summaries = [tup[1] for tup in all_summaries]

    with parallel_backend('multiprocessing', n_jobs=args.n_jobs):
        unique_summaries = Parallel()(
            delayed(remove_duplicates)(idx, summaries)
            for idx, summaries in enumerate(all_summaries))

    output_fnames = [
        args.out_dir + "/out_{}.txt".format(i)
        for i in range(args.num_summaries)
    ]
    write_to_files(unique_summaries, output_fnames)
    AID_path = os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID)
else:
    AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', AID)
save_path = AID_path + '/' + AID + 'graph_processed.pkl'
pickle_off = open(save_path, 'rb')
activity_table = pickle.load(pickle_off)
pickle_off.close()

'''Pick diverse starting point of 10% of library'''
fplist = [x for x in activity_table['bit_MFP']]

'''start_indexs holds the indexes of molecules already scanned at the start of
each iteration. So for the first iter it holds the diversity selection. For the
second, it holds both the diversity selection and the molecules screened based
on the results of the first training iteration, etc.'''
# build up metalists that will vary for each repetition
start_num = int(len(fplist) * 0.15)
with parallel_backend('multiprocessing'):
    start_index_metalist = Parallel(n_jobs=3)(
        delayed(getNextIterInds)(firstPicksList=i, fplist=j, bottom_to_select=k)
        for i, j, k in zip([[], [], []], [fplist, fplist, fplist],
                           [start_num, start_num, start_num]))
# start_indexs = np.array(mmp.LazyBitVectorPick(fplist,len(fplist),int(len(fplist)/10)))

'''store in a list that will vary as each model makes its predictions'''
for rep_num in range(3):
    # set rep specific variables
    metric_dict_list = metric_dict_metalist[rep_num]
    start_indexs = start_index_metalist[rep_num]
    multi_dump_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/',
                                   exper_file_name + str(rep_num) + '.pkl')
    # everything from here forwards doesn't need to change
def joblib_process(sub_f, *args):
    result = []
    with parallel_backend('multiprocessing', n_jobs=-1):
        res = Parallel()(delayed(sub_f)(*[img, args[0], args[1]])
                         for img in args[0])
    return res
def cal_factors(self, start, end, n_jobs): # type要用408001000,408005000,408004000(合并报表,合并更正前,合并调整后),同时有408001000和408005000用408005000 # 有408004000时,根据ann_dt酌情使用 # 目前包含字段: 净利润(net_profit),扣非净利润(net_profit_ddt),营收(oper_rev),总营收(tot_oper_rev), # 营业利润(oper_profit),摊薄eps(EPS_diluted),经营利润(oper_income),少数股东损益(minority_int_inc), # 财务费用(less_fin_exp),利息净收入(net_int_inc), 息税前利润(EBIT),报告类型(statement_type) query = "select ANN_DT, S_INFO_WINDCODE, REPORT_PERIOD, NET_PROFIT_EXCL_MIN_INT_INC, " \ "NET_PROFIT_AFTER_DED_NR_LP, OPER_REV, TOT_OPER_REV, TOT_OPER_COST, OPER_PROFIT, TOT_PROFIT, " \ "S_FA_EPS_DILUTED, MINORITY_INT_INC, LESS_FIN_EXP, NET_INT_INC, EBIT, STATEMENT_TYPE " \ "from wind_filesync.AShareIncome " \ "where ANN_DT >= {0} and ANN_DT <= {1} " \ "and (STATEMENT_TYPE = '408001000' or STATEMENT_TYPE = '408005000' or STATEMENT_TYPE = '408004000') " \ "and (s_info_windcode like '0%' or s_info_windcode like '3%' or s_info_windcode like '6%') " \ "order by report_period, ann_dt, statement_type " \ .format((dtparser.parse(str(start)) - relativedelta(years=4)).strftime('%Y%m%d'), str(end)) self.rdf.curs.execute(query) income = \ pd.DataFrame(self.rdf.curs.fetchall(), columns=['date', 'code', 'report_period', 'net_profit', 'net_profit_ddt', 'oper_rev', 'tot_oper_rev', 'tot_oper_cost', 'oper_profit', 'tot_profit', 'EPS_diluted', 'minority_interest_income', 'less_fin_exp', 'net_interest_income', 'EBIT', 'type']) income[['minority_interest_income', 'less_fin_exp', 'net_interest_income']] = \ income[['minority_interest_income', 'less_fin_exp', 'net_interest_income']].fillna(0) # 同一code,同一date,同一report_period,同时出现type1,2,3时,取type大的 income['type'] = income['type'].apply( lambda x: '2' if x == '408001000' else ('3' if x == '408005000' else '4')) income = income.sort_values( by=['code', 'date', 'report_period', 'type']) income['date'] = pd.to_datetime(income['date']) income['report_period'] = pd.to_datetime((income['report_period'])) # *************************************************************************** # 读取业绩快报 query = "select ANN_DT, S_INFO_WINDCODE, REPORT_PERIOD, OPER_REV, OPER_PROFIT, NET_PROFIT_EXCL_MIN_INT_INC, " \ "TOT_PROFIT, EPS_DILUTED " \ "from wind_filesync.AShareProfitExpress " \ "where ANN_DT >= {0} and ANN_DT <= {1} " \ "and (s_info_windcode like '0%' or s_info_windcode like '3%' or s_info_windcode like '6%') " \ "order by report_period, ann_dt" \ .format((dtparser.parse(str(start)) - relativedelta(years=4)).strftime('%Y%m%d'), str(end)) self.rdf.curs.execute(query) express = pd.DataFrame(self.rdf.curs.fetchall(), columns=[ 'date', 'code', 'report_period', 'oper_rev', 'oper_profit', 'net_profit', 'tot_profit', 'EPS_diluted' ]) express['date'] = pd.to_datetime(express['date']) express['report_period'] = pd.to_datetime(express['report_period']) express['type'] = '1' # *************************************************************************** # 读取业绩预告 query = "select S_PROFITNOTICE_DATE, S_INFO_WINDCODE, S_PROFITNOTICE_PERIOD, S_PROFITNOTICE_NETPROFITMIN, " \ "S_PROFITNOTICE_NETPROFITMAX " \ "from wind_filesync.AShareProfitNotice " \ "where S_PROFITNOTICE_DATE >= {0} and S_PROFITNOTICE_DATE <= {1} " \ "and (s_info_windcode like '0%' or s_info_windcode like '3%' or s_info_windcode like '6%') " \ "order by S_PROFITNOTICE_PERIOD, S_PROFITNOTICE_DATE" \ .format((dtparser.parse(str(start)) - relativedelta(years=4)).strftime('%Y%m%d'), str(end)) self.rdf.curs.execute(query) notice = pd.DataFrame(self.rdf.curs.fetchall(), columns=[ 'date', 'code', 'report_period', 'net_profit_min', 
'net_profit_max' ]) notice['date'] = pd.to_datetime(notice['date']) notice['report_period'] = pd.to_datetime(notice['report_period']) notice['type'] = '0' notice[['net_profit_min', 'net_profit_max']] = \ notice[['net_profit_min', 'net_profit_max']].fillna(method='bfill', axis=1) notice[['net_profit_min', 'net_profit_max']] = \ notice[['net_profit_min', 'net_profit_max']].fillna(method='ffill', axis=1) # 业绩预告的单位为: 万元 notice['net_profit'] = (0.5 * notice['net_profit_min'] + 0.5 * notice['net_profit_max']) * 10000 notice.drop(['net_profit_min', 'net_profit_max'], axis=1, inplace=True) # *************************************************************************** income = pd.concat([income, express, notice], ignore_index=True) income = income.sort_values( by=['code', 'date', 'report_period', 'type']) # 经营利润 = 净利润(含少数股东损益) - 非经常性损益 + 财务费用 * (1-0.25) - 利息净收入 * (1-0.25) # = 扣非净利润(扣除少数股东损益) + 少数股东损益 + 财务费用 * (1-0.25) - 利息净收入 * (1-0.25) income['oper_income'] = income['net_profit_ddt'] + income['minority_interest_income'] + \ income['less_fin_exp'] * (1 - 0.25) - income['net_interest_income'] * (1 - 0.25) income[ 'gross_margin'] = income['tot_oper_rev'] - income['tot_oper_cost'] # 需要的field fields = [ 'net_profit', 'net_profit_ddt', 'oper_rev', 'tot_oper_rev', 'tot_oper_cost', 'oper_profit', 'tot_profit', 'gross_margin', 'EPS_diluted', 'oper_income', 'EBIT' ] #fields = ['EBIT'] # 处理数据 calendar = self.rdf.get_trading_calendar() calendar = \ set(calendar.loc[(calendar >= (dtparser.parse(str(start)) - relativedelta(years=2)).strftime('%Y%m%d')) & (calendar <= str(end))]) # 存放的db save_db = 'FinancialReport_Gus' fail_list = [] for f in fields: print('ALL ANNOUNCEMENT \n field: %s begins processing...' % f) df = pd.DataFrame(income.dropna(subset=[f]).groupby(['code', 'date', 'report_period'])[f].last()) \ .reset_index() df = df.sort_values(by=['report_period', 'date']) df.set_index(['code', 'date', 'report_period'], inplace=True) df = df.unstack(level=2) df = df.loc[:, f] df = df.reset_index().set_index('date') codes = df['code'].unique() split_codes = np.array_split(codes, n_jobs) with parallel_backend('multiprocessing', n_jobs=n_jobs): res = Parallel()(delayed(IncomeUpdate.JOB_factors)( df, f, codes, calendar, start, save_db) for codes in split_codes) print('%s finish' % f) print('-' * 30) for r in res: fail_list.extend(r) return fail_list
# test
# date_family_size_list = list(zip(['2020-07-24', '2020-07-24'], ['VESTIDO', 'VESTIDO'], ['M', 'XXXL']))

# result
# Out[14]:
#          date family_desc  size  mean_weight_relative  mean_weight_abs  stock_nok
# 0  2020-07-24     VESTIDO     M              0.472393         0.032209          0
# 1  2020-07-24     VESTIDO  XXXL              0.722222         0.506944          1
#
# #### test end #########
######################################################################################################################

# run
with parallel_backend('threading', n_jobs=6):
    date_family_size_var_valor_list = Parallel()(
        delayed(get_var_distr_relat_abs)(date_family_size, df, path_results)
        for date_family_size in date_family_size_list)

df_indicators = pd.concat(date_family_size_var_valor_list)

# without distr_relative
df_indicators_gr = df_indicators.groupby(['date', 'family_desc', 'size']).agg({
    'distr_abs': 'mean'
}).reset_index()

# with distr_relative
# df_indicators_label = pd.merge(df_indicators, df_feedback, on=['date', 'family_desc', 'size'])
full_input_image: xa.DataArray = xa.open_rasterio(image_data_path,
                                                  chunks=(35, 1000, 1000))
input_image = full_input_image[:, 1100:1400, 1100:1400] if subset else full_input_image
space_coords = {key: input_image.coords[key].values for key in space_dims}
ml_input_data: np.ndarray = preprocess(input_image)
nodata_output = estimator.predict(np.zeros([1, input_image.shape[0]]))[0]

t1 = time.time()
with joblib.parallel_backend('dask'):
    print(f"Executing {modelType} estimator: {saved_model_path}, "
          f"parameters: {list(estimator.instance_parameters.items())}")
    ml_results: np.ndarray = estimator.predict(ml_input_data)
t2 = time.time()

depth_map_data: np.ndarray = ml_results.reshape(input_image.shape[1:])
result_map = xa.DataArray(depth_map_data,
                          coords=space_coords,
                          dims=space_dims,
                          name="depth_map")
depth_map = result_map.where(result_map != nodata_output, 0.0)
t3 = time.time()
def process_data(self, start, end, n_jobs):
    calendar = self.rdf.get_trading_calendar()
    calendar = calendar[(calendar >= str(start)) & (calendar <= str(end))]

    # fetch SSE 50 index weights
    weight_50 = self.idx_comp_sql.get_IndexComp(50, start, end)
    weight_50['index_code'] = '000016.SH'
    miss_dates = set(calendar) - set(weight_50.index.unique())
    if miss_dates:
        miss_dates = pd.DatetimeIndex(miss_dates).strftime('%Y%m%d')
        if miss_dates.shape[0] == 1:
            query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                    "from wind_filesync.AIndexSSE50Weight " \
                    "where TRADE_DT = {0}".format(miss_dates[0])
        else:
            query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                    "from wind_filesync.AIndexSSE50Weight " \
                    "where TRADE_DT in " + str(tuple(miss_dates))
        self.rdf.curs.execute(query)
        miss_df = pd.DataFrame(self.rdf.curs.fetchall(),
                               columns=['date', 'index_code', 'code', 'weight'])
        miss_df['date'] = pd.to_datetime(miss_df['date'])
        miss_df.set_index('date', inplace=True)
        weight_50 = pd.concat([weight_50, miss_df])

    # fetch CSI 300 index weights
    weight_300 = self.idx_comp_sql.get_IndexComp(300, start, end)
    weight_300['index_code'] = '000300.SH'
    miss_dates = set(calendar) - set(weight_300.index.unique())
    if miss_dates:
        dates_before_miss = {}
        for d in miss_dates:
            if calendar[calendar < d].empty:
                pass
            else:
                dates_before_miss[calendar[calendar < d].iloc[-1].strftime('%Y%m%d')] = \
                    d.strftime('%Y%m%d')
        if len(dates_before_miss) == 1:
            query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,i_weight " \
                    "from wind_filesync.AIndexHS300Weight " \
                    "where TRADE_DT = {0}".format(list(dates_before_miss.keys())[0])
        else:
            query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,i_weight " \
                    "from wind_filesync.AIndexHS300Weight " \
                    "where TRADE_DT in " + str(tuple(dates_before_miss.keys()))
        self.rdf.curs.execute(query)
        miss_df = pd.DataFrame(self.rdf.curs.fetchall(),
                               columns=['last_date', 'index_code', 'code', 'weight'])
        miss_df['date'] = miss_df['last_date'].map(dates_before_miss)
        miss_df['date'] = pd.to_datetime(miss_df['date'])
        miss_df.drop('last_date', axis=1, inplace=True)
        miss_df.set_index('date', inplace=True)
        weight_300 = pd.concat([weight_300, miss_df])

    # fetch CSI 500 index weights
    weight_500 = self.idx_comp_sql.get_IndexComp(500, start, end)
    weight_500['index_code'] = '000905.SH'
    miss_dates = set(calendar) - set(weight_500.index.unique())
    if miss_dates:
        miss_dates = pd.DatetimeIndex(miss_dates).strftime('%Y%m%d')
        if miss_dates.shape[0] == 1:
            query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                    "from wind_filesync.AIndexCSI500Weight " \
                    "where TRADE_DT = {0}".format(miss_dates[0])
        else:
            query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                    "from wind_filesync.AIndexCSI500Weight " \
                    "where TRADE_DT in " + str(tuple(miss_dates))
        self.rdf.curs.execute(query)
        miss_df = pd.DataFrame(self.rdf.curs.fetchall(),
                               columns=['date', 'index_code', 'code', 'weight'])
        miss_df['date'] = pd.to_datetime(miss_df['date'])
        miss_df.set_index('date', inplace=True)
        weight_500 = pd.concat([weight_500, miss_df])

    ########################################################################
    weight = pd.concat([weight_50, weight_300, weight_500])
    weight['weight'] = weight['weight'].astype('float')
    codes = weight['code'].unique()
    split_codes = np.array_split(codes, n_jobs)
    with parallel_backend('multiprocessing', n_jobs=n_jobs):
        res = Parallel()(delayed(influxdbData.JOB_saveData)(
            weight, 'code', codes, self.db, self.measure)
                         for codes in split_codes)
    print('IndexWeight finish')
    print('-' * 30)
    fail_list = []
    for r in res:
        fail_list.extend(r)
    return fail_list
def npoclass(inputs,
             gpu_core=True,
             model_path=None,
             ntee_type='bc',
             n_jobs=4,
             backend='multiprocessing',
             batch_size_dl=64,
             verbose=1):

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Check model files.
    if ntee_type == 'bc' and model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from "
            "https://jima.me/open/npoclass_model_bc.zip, unzip, and specify "
            "model_path (default set to None).")
    if ntee_type == 'mg' and model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from "
            "https://jima.me/open/npoclass_model_mg.zip, unzip, and specify "
            "model_path (default set to None).")

    # Check ntee type.
    if ntee_type == 'bc':
        le_file_name = 'le_broad_cat.pkl'
    elif ntee_type == 'mg':
        le_file_name = 'le_major_group.pkl'
    else:
        raise ValueError(
            "ntee_type must be 'bc' (broad category) or 'mg' (major group)")

    # Read model and label encoder, if not read.
    global model_loaded, tokenizer_loaded, label_encoder
    try:
        assert model_loaded
        assert tokenizer_loaded
        assert label_encoder
    except:
        # Load a pretrained model and tokenizer.
        model_loaded = BertForSequenceClassification.from_pretrained(model_path)
        tokenizer_loaded = BertTokenizer.from_pretrained(model_path)
        # Read label encoder.
        with open(model_path + le_file_name, 'rb') as label_encoder_pkl:
            label_encoder = pickle.load(label_encoder_pkl)

    # Select acceleration method.
    if gpu_core == True and torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count(),
              'Using GPU:', torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(seed_val)
        device = torch.device('cuda')
        model_loaded.cuda()
    else:
        print('No GPU acceleration available or gpu_core=False, using CPU.')
        device = torch.device('cpu')
        model_loaded.cpu()
    print('Encoding inputs ...')
    sleep(.5)  # Pause briefly so printed output stays readable.

    # Encode inputs.
    global func_encode_string, func_encode_string_batch  # Define as global, otherwise cannot pickle or very slow.

    def func_encode_string(text_string):
        encoded_dict = tokenizer_loaded.encode_plus(
            text_string,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
            truncation='longest_first',
            padding='max_length',  # Max length accepted by model.
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        return encoded_dict

    def func_encode_string_batch(text_strings):
        encoded_dicts = []
        for text_string in text_strings:
            encoded_dicts += [func_encode_string(text_string)]
        return encoded_dicts

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # Encode input string(s).
    if type(inputs) == list:
        if backend == 'multiprocessing':
            # Multiprocessing is faster than loky in processing large objects.
            encoded_outputs = Parallel(n_jobs=n_jobs,
                                       backend="multiprocessing",
                                       batch_size='auto',
                                       verbose=verbose)(
                delayed(func_encode_string)(text_string)
                for text_string in inputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'sequential':
            for text_string in tqdm(inputs):
                encoded_output = func_encode_string(text_string)
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'dask':
            with joblib.parallel_backend('dask'):
                n_jobs = len(client.scheduler_info()['workers'])  # Get number of workers.
                string_chunks = partition_all(
                    math.ceil(len(inputs) / n_jobs),
                    inputs)  # Collect into chunks sized by the number of workers.
                encoded_outputs = Parallel(n_jobs=-1,
                                           batch_size='auto',
                                           verbose=verbose)(
                    delayed(func_encode_string_batch)(text_strings)
                    for text_strings in string_chunks)
                encoded_outputs = itertools.chain(*encoded_outputs)
                for encoded_output in encoded_outputs:
                    # Add the encoded sentence to the list.
                    input_ids.append(encoded_output['input_ids'])
                    # And its attention mask (simply differentiates padding from non-padding).
                    attention_masks.append(encoded_output['attention_mask'])
    if type(inputs) == str:
        encoded_output = func_encode_string(inputs)
        input_ids = [encoded_output['input_ids']]
        attention_masks = [encoded_output['attention_mask']]

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Prepare dataloader for efficient calculation.
    pred_data = TensorDataset(input_ids, attention_masks)
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data,
                                 sampler=pred_sampler,
                                 batch_size=batch_size_dl)

    # Start prediction.
    model_loaded.eval()
    logits_all = []
    print('Predicting categories ...')
    sleep(.5)  # Pause briefly so printed output stays readable.
    for batch in tqdm(pred_dataloader, mininterval=10):
        # Add batch to the pre-chosen device.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model_loaded(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
        logits_all += outputs[0].tolist()

    # Calculate probabilities from the logits.
    logits_prob = tf.nn.sigmoid(logits_all).numpy().tolist()
    # Find the positions of max values in logits.
    logits_max = np.argmax(logits_prob, axis=1)
    # Transfer to labels.
    logits_labels = label_encoder.inverse_transform(logits_max)

    # Compile results to be returned.
    result_list = []
    for list_index in range(0, len(logits_labels)):
        result_dict = {}
        result_dict['recommended'] = logits_labels[list_index]
        conf_prob = logits_prob[list_index][logits_max[list_index]]
        if conf_prob >= .99:
            result_dict['confidence'] = 'high (>=.99)'
        elif conf_prob >= .95:
            result_dict['confidence'] = 'medium (<.99|>=.95)'
        else:
            result_dict['confidence'] = 'low (<.95)'
        prob_dict = {}
        for label_index in range(0, len(label_encoder.classes_)):
            prob_dict[label_encoder.classes_[label_index]] = logits_prob[list_index][label_index]
        result_dict['probabilities'] = prob_dict
        result_list += [result_dict]

    return result_list
    else:
        raise NotImplementedError
    monitor.times = np.array(monitor.times)
    monitor.objs = np.array(monitor.objs)
    monitor.objs_test = 0  # TODO
    monitor.alphas = np.array(monitor.alphas)
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test, monitor.alphas,
            alpha_max, model_name)


print("enter sequential")
with parallel_backend("loky", inner_max_num_threads=1):
    results = Parallel(n_jobs=n_jobs, verbose=100)(
        delayed(parallel_function)(
            dataset_name, method, n_outer=n_outer,
            tolerance_decrease=tolerance_decrease, tol=tol)
        for dataset_name, method, n_outer, tolerance_decrease in product(
            dataset_names, methods, n_outers, tolerance_decreases))
print('OK finished parallel')

df = pd.DataFrame(results)
df.columns = [
    'dataset', 'method', 'tol', 'n_outer', 'tolerance_decrease', 'times',
    'objs', 'objs_test', 'alphas', 'alpha_max', 'model_name']

for dataset_name in dataset_names:
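# Side note (general joblib behaviour, not specific to the script above):
# passing inner_max_num_threads=1 to parallel_backend caps the thread pools
# (OpenMP/BLAS) started inside each worker, which avoids over-subscription when
# every task itself calls multi-threaded numerical code. A minimal,
# self-contained sketch of that pattern:
import numpy as np
from joblib import Parallel, delayed, parallel_backend

def blas_heavy_task(seed):
    rng = np.random.default_rng(seed)
    a = rng.standard_normal((200, 200))
    return float(np.linalg.norm(a @ a.T))  # BLAS call kept single-threaded per worker

with parallel_backend("loky", inner_max_num_threads=1):
    norms = Parallel(n_jobs=2)(delayed(blas_heavy_task)(s) for s in range(4))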
predictors = ['TEMP', 'RH', 'PRECIP', 'U', 'V']
dilon, dilat = 6, 4

obs_data = xr.open_dataset(args.dataset)
# Mask out cells where we have no PRECIP data
mask = np.isnan(obs_data.PRECIP.isel(time=0)).rename("CONUS_MASK")

do_hybrid = args.case == 'hybrid'

print("Initializing model...")
obs_model = Shen2017Model(obs_data,
                          month=args.month,
                          mask=mask,
                          # lat_range=(30, 33), lon_range=(-80, -78),
                          verbose=True,
                          n_predictors=3,
                          hybrid=do_hybrid,
                          cv=args.cv)

print("Fitting model...")
with parallel_backend('dask.distributed', scheduler_host='localhost:8786'):
    obs_model.fit_parallel(-1)

# Save output
print("Saving to", args.output)
obs_model.to_pickle(args.output)

# Test prediction
print("Making test prediction")
obs_model.predict(obs_data).to_netcdf("test.pred.nc")
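# Hedged note: the 'dask.distributed' backend name with a scheduler_host
# argument comes from older releases of distributed. With current
# dask.distributed the joblib backend is registered as 'dask' and uses
# whichever Client is active, so an equivalent setup (assuming a scheduler is
# already listening on localhost:8786) would look roughly like this:
from dask.distributed import Client
from joblib import parallel_backend

client = Client("localhost:8786")  # connect to the running scheduler
with parallel_backend("dask"):
    obs_model.fit_parallel(-1)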
# shutil.copy2('/Users/rogerzhu/Documents/temoa/temoa-va/virginia/data/data_virginia.xlsx',
#              '/Users/rogerzhu/Documents/temoa/temoa-va/virginia/data/data_virginia_' + str(calendar.timegm(time.gmtime())) + '.xlsx')

for modelInputs_XLSX, scenarioNames in zip(modelInputs_XLSX_list, scenarioNames_list):

    # =======================================================
    # Move modelInputs_XLSX to database
    # =======================================================
    modelInputs = tt.move_data_to_db(modelInputs_XLSX, path=project_path)

    # =======================================================
    # Create directories - best completed before using multiprocessing
    # =======================================================
    tt.create_dir(project_path=project_path, optional_dir='results')

    # ====================================
    # Perform Simulations
    option = 2  # 1 - Run single, 2 - Run all
    # ====================================

    if option == 1:
        # Perform single simulation
        evaluateModel(modelInputs, scenarioInputs, scenarioNames[0], temoa_path)

    elif option == 2:
        # Perform simulations in parallel
        with parallel_backend('multiprocessing', n_jobs=ncpus):
            Parallel(n_jobs=ncpus, verbose=5)(
                delayed(evaluateModel)(modelInputs, scenarioInputs, scenarioName,
                                       temoa_path, project_path, solver)
                for scenarioName in scenarioNames)
def evaluate(model, criterion, input_tensor, target_tensor, BOS_token,
             device='cuda', EOS_token=2, PAD_token=0, TARGET_LEN=30,
             beam_size=3, beam_search=False):
    model.eval()
    with torch.no_grad():
        batch_size = input_tensor.size(0)
        encoder_hidden = model.encoder.initHidden(batch_size).to(device)

        input_tensor = input_tensor.transpose(0, 1).to(device)
        target_tensor = target_tensor.transpose(0, 1).to(device)

        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)

        # encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
        enc_outputs, encoder_hidden = model(input_tensor, encoder_hidden,
                                            batch_size, encoding=True,
                                            enc_outputs=None)

        decoder_input = torch.LongTensor(
            [BOS_token for _ in range(batch_size)]).view(-1, 1).to(device)
        encoder_hidden = encoder_hidden.view(model.encoder.n_layers,
                                             model.encoder.direction,
                                             batch_size,
                                             model.encoder.hidden_size)
        decoder_hidden = torch.cat(
            (encoder_hidden[:, 0, :, :], encoder_hidden[:, 1, :, :]), dim=2)

        def bs(i):
            start_node = BeamNode(decoder_hidden[:, i:i + 1, :].contiguous(),
                                  None, decoder_input[i, :].contiguous(), 0, 1)
            all_nodes = [start_node]
            now_nodes = [start_node]
            end_pq = PriorityQueue()
            for j in range(TARGET_LEN):
                if len(now_nodes) == 0:
                    break
                pq = PriorityQueue()
                for node in now_nodes:
                    input, hidden = node.idx, node.hidden
                    output, hidden = model(input, hidden, 1, encoding=False,
                                           enc_outputs=enc_outputs[:, i:i + 1, :])
                    output = F.log_softmax(output, dim=1)
                    topv, topi = output.data.topk(beam_size)
                    for (score, idx) in zip(topv.detach().squeeze(0),
                                            topi.detach().squeeze(0)):
                        nxt_node = BeamNode(hidden, node, idx.unsqueeze(0),
                                            node.score + score, node.length + 1)
                        pq.put(nxt_node)
                now_nodes = []
                for _ in range(beam_size):
                    assert pq.qsize() > 0
                    node = pq.get()
                    all_nodes.append(node)
                    if node.idx == EOS_token or j == TARGET_LEN - 1:
                        end_pq.put(node)
                    else:
                        now_nodes.append(node)
            assert end_pq.qsize() > 0
            best_node = end_pq.get()
            predict = [best_node.idx.cpu().numpy()[0]]
            while best_node.prev is not None:
                best_node = best_node.prev
                predict.append(best_node.idx.cpu().numpy()[0])
            predict = predict[-2::-1]
            while len(predict) < TARGET_LEN:
                predict.append(PAD_token)
            return (i, np.array(predict))

        if beam_search:
            with parallel_backend('threading', n_jobs=-2):
                decoder_predict = Parallel()(delayed(bs)(i)
                                             for i in range(batch_size))
            decoder_predict = sorted(decoder_predict, key=lambda x: x[0])
            decoder_predict = [x[1] for x in decoder_predict]
            decoder_predict = np.stack(decoder_predict)
            return decoder_predict
        else:
            loss = 0
            decoder_predict = []
            for di in target_tensor:
                decoder_output, decoder_hidden = model(decoder_input,
                                                       decoder_hidden,
                                                       batch_size,
                                                       encoding=False,
                                                       enc_outputs=enc_outputs)
                loss += criterion(decoder_output, di.view(-1))
                topv, topi = decoder_output.data.topk(1)
                decoder_input = topi.detach().to(device)
                decoder_predict.append(topi.cpu().numpy())

            decoder_predict = np.hstack(decoder_predict)
            return loss.item() / target_length, decoder_predict
def paralelizeJobWhithDaskClient(function, client):
    c = client
    print(c)
    with joblib.parallel_backend('dask'):
        function()
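# Hypothetical usage sketch (the Client/LocalCluster setup and the grid-search
# workload below are assumptions, not part of the original snippet): the helper
# only activates the 'dask' joblib backend, so any joblib-aware function passed
# to it will fan its inner Parallel calls out to the cluster workers.
from dask.distributed import Client, LocalCluster
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def fit_grid():
    digits = load_digits()
    GridSearchCV(SVC(), {"C": [1.0, 10.0]}, n_jobs=-1).fit(digits.data, digits.target)

cluster = LocalCluster(n_workers=2)
client = Client(cluster)
paralelizeJobWhithDaskClient(fit_grid, client)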
import numpy as np
from joblib import parallel_backend
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

from ray.util.joblib import register_ray

register_ray()

param_space = {
    'C': np.logspace(-6, 6, 30),
    'gamma': np.logspace(-8, 8, 30),
    'tol': np.logspace(-4, -1, 30),
    'class_weight': [None, 'balanced'],
}

model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=5, n_iter=300, verbose=1)

digits = load_digits()

with parallel_backend('ray'):
    search.fit(digits.data, digits.target)
def nuscenes_gt_to_kitti(
    self,
    lyft_dataroot: str = "/home/yw763/driving/lyft/v1.02-train",
    table_folder: str = "/home/yw763/driving/lyft/v1.02-train/v1.02-train",
    lidar_name: str = "LIDAR_TOP",
    get_all_detections: bool = True,
    parallel_n_jobs: int = 16,
    samples_count: Optional[int] = None,
) -> None:
    """Converts nuScenes GT formatted annotations to KITTI format.

    Args:
        lyft_dataroot: folder with tables (json files).
        table_folder: folder with tables (json files).
        lidar_name: Name of the lidar sensor. Only one lidar allowed at this moment.
        get_all_detections: If True, will write all bboxes in PointCloud and use only FrontCamera.
        parallel_n_jobs: Number of threads for parallel processing.
        samples_count: Number of samples to convert.
    """
    self.lyft_dataroot = lyft_dataroot
    self.table_folder = table_folder
    self.lidar_name = lidar_name
    self.get_all_detections = get_all_detections
    self.samples_count = samples_count
    self.parallel_n_jobs = parallel_n_jobs

    # Select subset of the data to look at.
    self.lyft_ds = LyftDataset(self.lyft_dataroot, self.table_folder)

    self.kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi)
    self.kitti_to_nu_lidar_inv = self.kitti_to_nu_lidar.inverse

    # Get assignment of scenes to splits.
    split_logs = [
        self.lyft_ds.get("log", scene["log_token"])["logfile"]
        for scene in self.lyft_ds.scene
    ]
    if self.get_all_detections:
        self.cams_to_see = ["CAM_FRONT"]
    else:
        self.cams_to_see = [
            "CAM_FRONT",
            "CAM_FRONT_LEFT",
            "CAM_FRONT_RIGHT",
            "CAM_BACK",
            "CAM_BACK_LEFT",
            "CAM_BACK_RIGHT",
        ]

    # Create output folders.
    self.label_folder = self.store_dir.joinpath("label_2")
    self.calib_folder = self.store_dir.joinpath("calib")
    self.image_folder = self.store_dir.joinpath("image_2")
    self.lidar_folder = self.store_dir.joinpath("velodyne")
    for folder in [
            self.label_folder, self.calib_folder, self.image_folder,
            self.lidar_folder
    ]:
        if not folder.is_dir():
            folder.mkdir(parents=True)

    # Use only the samples from the current split.
    sample_tokens = self._split_to_samples(split_logs)
    if self.samples_count is not None:
        sample_tokens = sample_tokens[:self.samples_count]
    # print(len(sample_tokens))
    self.tokens = sample_tokens

    with parallel_backend("threading", n_jobs=self.parallel_n_jobs):
        Parallel()(delayed(self.process_token_to_kitti)(sample_token)
                   for sample_token in tqdm(sample_tokens))
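# Minimal sketch of the same threading-backend pattern in isolation; the
# process_token helper and token list below are placeholders for
# self.process_token_to_kitti and the real sample tokens, not part of the
# original converter. Threads are a reasonable fit here because the per-sample
# work is dominated by file I/O rather than Python computation.
from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm

def process_token(token):
    # Placeholder for the real per-sample conversion work.
    return token.upper()

tokens = ["token_a", "token_b", "token_c"]
with parallel_backend("threading", n_jobs=2):
    Parallel()(delayed(process_token)(token) for token in tqdm(tokens))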
def function_multiprocessing(
    function: Callable,
    kwargs_list: List[Dict[str, Any]],
    recombined_epc: Union[Path, str],
    cluster,
    consolidate: bool = True,
) -> List[bool]:
    """Calls a function concurrently with the specified arguments.

    A multiprocessing pool is used to call the function multiple times in parallel.
    Once all results are returned, they are combined into a single epc file.

    Args:
        function (Callable): the function to be called. Needs to return:
            - index (int): the index of the kwargs in the kwargs_list.
            - success (bool): whether the function call was successful, whatever
              that definition is.
            - epc_file (Path/str): the epc file path where the objects are stored.
            - uuid_list (List[str]): list of UUIDs of relevant objects.
        kwargs_list (List[Dict[Any]]): A list of keyword argument dictionaries that
            are used when calling the function.
        recombined_epc (Path/str): A pathlib Path or path string of where the
            combined epc will be saved.
        cluster (LocalCluster/JobQueueCluster): a LocalCluster is a Dask cluster on
            a local machine. If using a job queuing system, a JobQueueCluster can be
            used such as an SGECluster, SLURMCluster, PBSCluster, LSFCluster etc.
        consolidate (bool): if True and an equivalent part already exists in a
            model, it is not duplicated and the uuids are noted as equivalent.

    Returns:
        success_list (List[bool]): A boolean list of successful function calls.

    Note:
        This function uses the Dask backend to run the given function in parallel,
        so a Dask cluster must be set up and passed as an argument. Dask will need
        to be installed in the Python environment because it is not a dependency of
        the project. More info can be found at
        https://docs.dask.org/en/latest/deploying.html
    """
    log.info("Multiprocessing function called with %s function.", function.__name__)

    for i, kwargs in enumerate(kwargs_list):
        kwargs["index"] = i

    with parallel_backend("dask"):
        results = Parallel()(delayed(function)(**kwargs) for kwargs in kwargs_list)

    log.info("Function calls complete.")

    # Sorting the results by the original kwargs_list index.
    results = list(sorted(results, key=lambda x: x[0]))

    success_list = [result[1] for result in results]
    epc_list = [result[2] for result in results]
    uuids_list = [result[3] for result in results]
    log.info("Number of successes: %s/%s.", sum(success_list), len(results))

    epc_file = Path(str(recombined_epc))
    if epc_file.is_file():
        model_recombined = Model(epc_file=str(epc_file))
    else:
        model_recombined = new_model(epc_file=str(epc_file))

    log.info("Creating the recombined epc file.")
    for i, epc in enumerate(epc_list):
        if epc is None:
            continue
        while True:
            try:
                model = Model(epc_file=epc)
                break
            except FileNotFoundError:
                time.sleep(1)
                continue
        uuids = uuids_list[i]
        if uuids is None:
            uuids = model.uuids()
        for uuid in uuids:
            model_recombined.copy_uuid_from_other_model(model, uuid=uuid,
                                                        consolidate=consolidate)

    # Deleting temporary directory.
    log.info("Deleting the temporary directory")
    rm_tree("tmp_dir")

    model_recombined.store_epc()
    log.info("Recombined epc file complete.")

    return success_list
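# Hypothetical usage sketch illustrating the calling convention only; make_grid,
# its keyword arguments and the returned values are placeholders, not part of the
# original API. A real worker would write its objects to its own epc file and
# return that path so the recombination step has something to copy from. A Client
# is created from the LocalCluster so the 'dask' joblib backend has a scheduler
# to submit to.
from dask.distributed import Client, LocalCluster

def make_grid(index, title):
    # Placeholder worker: must return (index, success, epc_file, uuid_list).
    return index, True, None, None

cluster = LocalCluster(n_workers=2)
client = Client(cluster)
kwargs_list = [{"title": f"grid {i}"} for i in range(2)]
success_list = function_multiprocessing(make_grid, kwargs_list, "combined.epc",
                                        cluster=cluster)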
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        classifier_obj = sklearn.svm.SVC(C=svc_c, gamma="auto")
    else:
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        classifier_obj = sklearn.ensemble.RandomForestClassifier(
            max_depth=rf_max_depth, n_estimators=10)

    score = sklearn.model_selection.cross_val_score(classifier_obj, x, y,
                                                    n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    with joblib.parallel_backend("ray", n_jobs=-1):
        study.optimize(objective, n_trials=100)

    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Elapsed time: "
          f"{study.trials[-1].datetime_complete - study.trials[0].datetime_start}")
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
# If you are on a UNIX system, it is possible to fall back to the old
# ``multiprocessing`` backend, which can pickle interactively defined functions
# with the default pickle module, which is faster for such large objects.
#

if sys.platform != 'win32':
    if IS_RUN_WITH_SPHINX_GALLERY:
        # When this example is run with sphinx gallery, it breaks the pickling
        # capacity for multiprocessing backend so we have to modify the way we
        # define our functions. This has nothing to do with the example.
        from utils import func_async
    else:
        def func_async(i, *args):
            return 2 * i

    with parallel_backend('multiprocessing'):
        t_start = time.time()
        Parallel(n_jobs=2)(
            delayed(func_async)(21, large_list) for _ in range(1))
        print("With multiprocessing backend and pickle serialization: {:.3f}s"
              .format(time.time() - t_start))

###############################################################################
# However, using ``fork`` to start new processes can cause violation of the
# POSIX specification and can have bad interaction with compiled extensions
# that use ``openmp``. Also, it is not possible to start processes with
# ``fork`` on Windows, where only ``spawn`` is available. The ``loky`` backend
# has been developed to mitigate these issues.
#
# To have fast pickling with ``loky``, it is possible to rely on ``pickle`` to