def test_dont_assume_function_purity(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                scheduler_host=('127.0.0.1', s['port'])):

            x, y = Parallel()(delayed(random2)() for i in range(2))
            assert x != y
def test_simple(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                scheduler_host=('127.0.0.1', s['port'])):

            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]

            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]
Example #3
def test_dont_assume_function_purity(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                scheduler_host=('127.0.0.1', s['port'])) as p:

            x, y = Parallel()(delayed(random2)() for i in range(2))
            assert x != y

            from joblib.parallel import get_active_backend
            ba, _ = get_active_backend()
            ba.executor.shutdown()
Example #4
def test_simple(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                scheduler_host=('127.0.0.1', s['port'])) as p:

            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]

            with pytest.raises(ValueError):
                Parallel()(delayed(slow_raise_value_error)(i == 3)
                    for i in range(10))

            seq = Parallel()(delayed(inc)(i) for i in range(10))
            assert seq == [inc(i) for i in range(10)]

            from joblib.parallel import get_active_backend
            ba, _ = get_active_backend()
            ba.executor.shutdown()
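Outside the test harness, the pattern these tests exercise reduces to a few lines. A minimal sketch, assuming a Dask scheduler is already reachable (the address below is a placeholder) and using the current 'dask' backend name rather than the older 'distributed' registration used in the tests:

# Minimal standalone sketch of the joblib-on-distributed pattern tested above.
# Assumes a running Dask scheduler; the address is a placeholder.
from dask.distributed import Client
from joblib import Parallel, delayed, parallel_backend


def inc(i):
    return i + 1


client = Client("tcp://127.0.0.1:8786")  # hypothetical scheduler address

with parallel_backend("dask"):
    seq = Parallel()(delayed(inc)(i) for i in range(10))

assert seq == [inc(i) for i in range(10)]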
def run_simulation(delta_reg, previous_params):
    config_vmc_file = import_config(sys.argv[1])
    config_vmc_import = config_vmc_file.MC_parameters(int(sys.argv[2]),
                                                      int(sys.argv[3]), rank)

    config_vmc = cv_module.MC_parameters(int(sys.argv[2]), int(sys.argv[3]),
                                         rank)
    config_vmc.__dict__ = config_vmc_import.__dict__.copy()

    print_model_summary(config_vmc)

    if previous_params is not None:
        config_vmc.initial_parameters = previous_params

    config_vmc.workdir = config_vmc.workdir + '/irrep_{:d}_Ne_{:d}/'.format(
        rank, int(sys.argv[3]))

    os.makedirs(config_vmc.workdir, exist_ok=True)
    with open(os.path.join(config_vmc.workdir, 'config.py'), 'w') as target, \
         open(sys.argv[1], 'r') as source:  # save config file to workdir (to remember!!)
        target.write(source.read())

    if config_vmc.visualisation:
        # config_vmc.twist = [np.exp(2.0j * np.pi * 0.1904), np.exp(2.0j * np.pi * (0.1904 + 0.1))]
        visualisation.plot_levels_evolution_mu(config_vmc)
        visualisation.plot_all_waves(config_vmc)
        visualisation.plot_DOS(config_vmc)
        visualisation.plot_fermi_surface(config_vmc)
        visualisation.plot_all_waves(config_vmc)
        visualisation.plot_all_Jastrow(config_vmc)
        visualisation.plot_MF_spectrum_profile(config_vmc)

    config_vmc.twist = [
        1, 1
    ]  #[np.exp(2.0j * np.pi * 0.1904), np.exp(2.0j * np.pi * (0.1904 + 0.10))]
    if config_vmc.tests:
        if rank == 0:
            if tests.perform_all_tests(config_vmc):
                print('\033[92m All tests passed successfully \033[0m',
                      flush=True)
            else:
                print('\033[91m Warning: some of the tests failed! \033[0m',
                      flush=True)
    comm.Barrier()

    n_cpus_max = psutil.cpu_count(logical=True)
    print('max available CPUs:', n_cpus_max)
    n_cpus = config_vmc.n_cpus
    if config_vmc.n_cpus == -1:
        n_cpus = n_cpus_max
    print('performing simulation at', n_cpus, 'CPUs')

    ### generate twists once and for all (Sandro's suggestion) ###

    if config_vmc.twist_mesh == 'Baldereschi':
        print('Working with the Baldereschi mesh')
        if config_vmc.n_sublattices == 2:
            twists = [[
                np.exp(2.0j * np.pi * 0.1904),
                np.exp(2.0j * np.pi * (0.1904 + 0.1))
            ] for _ in range(config_vmc.n_chains)]
        if config_vmc.n_sublattices == 1:
            twists = [[1., -1.] for _ in range(config_vmc.n_chains)]  # FIXME
        twists_per_cpu = config_vmc.n_chains / n_cpus
    elif config_vmc.twist_mesh == 'PBC':
        twists = [[1., 1.] for _ in range(config_vmc.n_chains)]
        twists_per_cpu = config_vmc.n_chains / n_cpus
    elif config_vmc.twist_mesh == 'APBCy':
        twists = [[1., -1.] for _ in range(config_vmc.n_chains)]
        twists_per_cpu = config_vmc.n_chains / n_cpus
    elif config_vmc.twist_mesh == 'reals':
        assert config_vmc.n_chains == 4
        twists = [[1, 1], [1, -1], [-1, 1], [-1, -1]]
        twists_per_cpu = config_vmc.n_chains / n_cpus
        assert twists_per_cpu == 1
    elif config_vmc.twist_mesh == 'uniform':
        L = config_vmc.L_twists_uniform
        twists = []
        for i_x in range(L):
            for i_y in range(L):
                twists.append([
                    np.exp(1.0j * np.pi * (-1. + 1. / L + 2. * i_x / L)),
                    np.exp(1.0j * np.pi * (-1. + 1. / L + 2. * i_y / L))
                ])
        twists_per_cpu = len(twists) // n_cpus
        if twists_per_cpu * n_cpus < len(twists):
            twists_per_cpu += 1
    else:
        print('Twist {:s} is not supported'.format(config_vmc.twist_mesh))
        exit(-1)
    print(twists)
    print(
        'Number of twists: {:d}, number of chains {:d}, twists per cpu {:.2f}'.
        format(len(twists), config_vmc.n_chains, twists_per_cpu))
    K_matrices_up = [
        models.apply_TBC(config_vmc,
                         twist,
                         deepcopy(config_vmc.K_0),
                         inverse=False) for twist in twists
    ]
    print(repr(K_matrices_up[0]))
    print(config_vmc.K_0)
    #exit(-1)
    K_matrices_down = [
        models.apply_TBC(config_vmc,
                         twist,
                         deepcopy(config_vmc.K_0).T,
                         inverse=True) for twist in twists
    ]
    reg_terms = [models.apply_TBC(config_vmc, twist, deepcopy(config_vmc.reg_gap_term), inverse = False) * \
                 config_vmc.reg_gap_val for twist in twists]

    config_vmc.MC_chain = config_vmc.MC_chain // len(
        twists)  # the MC_chain contains the total required number of samples

    pairings_list = config_vmc.pairings_list
    pairings_names = config_vmc.pairings_list_names

    # template = 'e_{:.2f}_Ne_{:d}'.format(config_vmc.epsilon, config_vmc.Ne) if config_vmc.PN_projection else \
    #            'e_{:.2f}_mu_{:.2f}'.format(config_vmc.epsilon, config_vmc.mu)

    local_workdir = config_vmc.workdir

    obs_files = []
    loaded_from_external = False
    if config_vmc.load_parameters:
        if config_vmc.load_parameters_path is not None:
            loaded_from_external = True
            filename = config_vmc.load_parameters_path
            parameters, last_step = load_parameters(filename)
        elif os.path.isfile(os.path.join(local_workdir, 'last_opt_params.p')):
            filename = os.path.join(local_workdir, 'last_opt_params.p')
            parameters, last_step = load_parameters(filename)
        else:
            parameters = config_vmc.initial_parameters
            last_step = 0
    else:
        parameters = config_vmc.initial_parameters
        last_step = 0

    if config_vmc.condensation_energy_check_regime:
        parameters[config_vmc.layout[:3].sum():config_vmc.layout[:4].sum(
        )] = 0.

    log_file = open(os.path.join(local_workdir, 'general_log.dat'), 'a+')
    force_file = open(os.path.join(local_workdir, 'force_log.dat'), 'a+')
    gaps_file = open(os.path.join(local_workdir, 'gaps_log.dat'), 'a+')
    force_SR_file = open(os.path.join(local_workdir, 'force_SR_log.dat'), 'a+')

    spectral_file = open(os.path.join(local_workdir, 'spectral_log.dat'), 'a+')
    final_states = [False] * len(twists)
    orbitals_in_use = [None] * len(twists)

    ### write log header only if we start from some random parameters ###
    if last_step == 0 or loaded_from_external:
        write_initial_logs(log_file, force_file, force_SR_file, config_vmc)

    #for n_step in range(last_step, config_vmc.optimisation_steps):
    n_step = last_step
    while n_step < config_vmc.optimisation_steps:
        t = time()
        if twists_per_cpu > 1:
            with parallel_backend("loky", inner_max_num_threads=1):
                results_batched = Parallel(n_jobs=n_cpus)(delayed(get_MC_chain_result)( \
                        n_step, \
                        deepcopy(config_vmc), \
                        pairings_list, \
                        parameters, \
                        twists = twists[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \
                        final_states = final_states[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \
                        orbitals_in_use = orbitals_in_use[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \
                        K_matrices_up = K_matrices_up[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \
                        K_matrices_down = K_matrices_down[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \
                        regs = reg_terms[i * twists_per_cpu:np.min([(i + 1) * twists_per_cpu, len(twists)])], \
                    ) for i in range(n_cpus))
                #for i in range(n_cpus):
                #    print(i * twists_per_cpu, np.min([(i + 1) * twists_per_cpu, len(twists)]))
                results = []
                for i, r in enumerate(results_batched):
                    results = results + r
                    print('obtained {:d} results from {:d} cpu'.format(
                        len(r), i))
                print('obtained in total {:d} results'.format(len(results)))
                #print(len(twists))
        else:
            with parallel_backend("loky", inner_max_num_threads=1):
                results = Parallel(n_jobs=config_vmc.n_chains)(delayed(_get_MC_chain_result)( \
                        n_step, \
                        deepcopy(config_vmc), \
                        pairings_list, \
                        parameters, \
                        twists[i], \
                        final_states[i], \
                        orbitals_in_use[i], \
                        K_matrices_up[i], \
                        K_matrices_down[i], \
                        reg_terms[i], \
                    ) for i in range(config_vmc.n_chains))
        print('MC chain generation for rank {:d}, step {:d} took {:f}'.format(
            rank, n_step,
            time() - t))
        t = time()

        ### print-out current energy levels ###
        E = results[0][7]
        spectral_file.write(
            str(n_step) + ' ' + ("{:.7f} " * len(E) + '\n').format(*E))
        spectral_file.flush()
        ### MC chains data extraction ###
        gaps, gap, energies, mean_variance, Os, acceptance, \
            final_states, densities, orbitals_in_use, occupied_numbers = \
            extract_MC_data(results, config_vmc, config_vmc.n_chains)
        energies_merged = np.concatenate(energies)

        n_above_FS = len(
            np.setdiff1d(occupied_numbers[0],
                         np.arange(config_vmc.total_dof // 2)))
        ### gradient step ###
        if config_vmc.generator_mode:  # evolve parameters only if it's necessary
            mask = np.ones(np.sum(config_vmc.layout))
            factor_stages = 1
            if n_step < 100:  # jastrows and mu_BCS have not converged yet
                mask = np.zeros(np.sum(config_vmc.layout))
                mask[-config_vmc.layout[4]:] = 1.
                factor_stages = 30.
            #    # mask[:config_vmc.layout[0]] = 1.
            #    mask[config_vmc.layout[0] + config_vmc.layout[1] + config_vmc.layout[2]:config_vmc.layout[0] + \
            #         config_vmc.layout[1] + config_vmc.layout[2] + config_vmc.layout[3]] = 0.
            #mask[1] = 0.0  # fugacity is not optimized in the meantime

            # Os = [np.einsum('ik,k->ik', Os_theta, config_vmc.mask) for Os_theta in Os]

            step, forces = make_SR_step(Os, energies, config_vmc, twists, gaps,
                                        n_step, mask)

            write_intermediate_log(log_file, force_file, force_SR_file, n_step, config_vmc.total_dof // 2, energies, densities, \
                                   mean_variance, acceptance, forces, step, gap, n_above_FS, parameters)  # write parameters before step not to lose the initial values

            write_gaps_log(gaps_file, gaps, n_step)

            #if np.abs(gap) < 1e-4:  # if the gap is too small, SR will make gradient just 0
            #    step = forces
            #step = forces * config_vmc.opt_parameters[1]
            step = step * config_vmc.opt_parameters[1]
            #step = clip_forces(config_vmc.all_clips, step)

            parameters += step * mask * factor_stages  # lr better be ~0.01..0.1

            #if parameters[0] < config_vmc.mu_BCS_min:
            #    parameters[0] = config_vmc.mu_BCS_min
            #if parameters[0] > config_vmc.mu_BCS_max:
            #    parameters[0] = config_vmc.mu_BCS_max

            save_parameters(parameters, config_vmc.workdir, n_step)
        ### END SR STEP ###

        observables = np.concatenate([np.array(x[4]) for x in results], axis=0)
        observables_names = results[0][5]
        n_step += 1
        if len(observables_names) == 0:
            continue

        if n_step == config_vmc.thermalization + 1:
            create_obs_files(observables_names, config_vmc)

        write_observables(n_step, obs_files, observables, config_vmc)
        if rank == 0:
            print('SR and logging {:d} took {:f}'.format(n_step, time() - t))

    log_file.close()
    force_file.close()
    force_SR_file.close()
    spectral_file.close()

    for file in obs_files:
        file.close()
    return parameters
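The batched dispatch inside the optimisation loop above (slicing the twists into one chunk per CPU, running each chunk in a loky worker with nested threading disabled, then flattening the per-worker result lists) is an instance of a generic pattern. A minimal sketch with a stand-in worker function:

# Minimal sketch of the chunked loky dispatch used in run_simulation:
# split the work items into one slice per job, disable nested threading
# in the workers, then flatten the per-worker lists.
import numpy as np
from joblib import Parallel, delayed, parallel_backend


def process_chunk(items):
    return [item ** 2 for item in items]  # stand-in for get_MC_chain_result


items = list(range(17))
n_jobs = 4
per_job = int(np.ceil(len(items) / n_jobs))

with parallel_backend("loky", inner_max_num_threads=1):
    batched = Parallel(n_jobs=n_jobs)(
        delayed(process_chunk)(items[i * per_job:(i + 1) * per_job])
        for i in range(n_jobs))

results = [x for chunk in batched for x in chunk]
assert results == [i ** 2 for i in items]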
Example #6
from pyitab.analysis.searchlight import SearchLight
from sklearn.model_selection import *
from pyitab.utils import load_test_dataset
import joblib
from dask.distributed import Client
from dask_kubernetes import KubeCluster

cluster = KubeCluster.from_yaml('pods.yml')

pods = cluster.scale(6)
client = Client(cluster.scheduler_address)

ds = load_test_dataset()

cv = KFold()
with joblib.parallel_backend('dask', scatter=[ds]): 
    scores = SearchLight(cv=cv).fit(ds) 
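The same SearchLight pattern can be run without Kubernetes by pointing joblib at a local Dask cluster; a minimal sketch of that variant, assuming the same pyitab test dataset as above:

# Local-cluster variant of the Dask/Kubernetes SearchLight example above.
from dask.distributed import Client, LocalCluster
import joblib
from sklearn.model_selection import KFold
from pyitab.analysis.searchlight import SearchLight
from pyitab.utils import load_test_dataset

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

ds = load_test_dataset()
cv = KFold()
with joblib.parallel_backend('dask', scatter=[ds]):
    scores = SearchLight(cv=cv).fit(ds)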
def handle_call(func, args, kwargs, proxy_positions=[]):
    if len(proxy_positions) > 0:
        args, kwargs = replace_with_values(args, kwargs, proxy_positions)

    with parallel_backend('sequential'):
        return func(*args, **kwargs)
    c = Client(cluster)

    # Query the client for all connected workers
    workers = c.has_what().keys()
    n_workers = len(workers)
    df = cudf.read_csv(os.path.join(data_dir, "train.csv"))
    N_TRIALS = 5

    # Drop non-numerical data and fill NaNs before passing to cuML RF
    CAT_COLS = list(df.select_dtypes('object').columns)
    df = df.drop(CAT_COLS, axis=1)
    df = df.dropna()

    df = df.astype("float32")
    X, y = df.drop(["target"], axis=1), df["target"].astype('int32')

    study_name = "dask_optuna_lr_log_loss_tpe"
    storage_name = "sqlite:///study_stores.db"

    storage = dask_optuna.DaskStorage(storage_name)
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(),
                                study_name=study_name,
                                direction="minimize",
                                storage=storage)
    # Optimize in parallel on your Dask cluster
    with parallel_backend("dask"):
        study.optimize(lambda trial: objective(trial, X, y),
                       n_trials=N_TRIALS,
                       n_jobs=n_workers)
    print('Best params{} and best score{}'.format(study.best_params,
                                                  study.best_value))
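The snippet above calls an objective(trial, X, y) that is not shown. A hypothetical CPU-only stand-in (scikit-learn instead of cuML, so X and y are assumed to be NumPy/pandas-compatible) that satisfies the same Optuna interface and returns the log loss being minimised:

# Hypothetical stand-in for the objective() referenced above; the original
# presumably trains a cuML random forest on GPU.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


def objective(trial, X, y):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 16),
    }
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=trial.number)
    clf = RandomForestClassifier(**params).fit(X_train, y_train)
    return log_loss(y_valid, clf.predict_proba(X_valid))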
Example #9
def main():
    unsupervised_models = ['OCSVM', 'IF']
    is_supervised = args.model not in unsupervised_models
    # outliers = args.outliers / 100
    outliers = 0.05
    in_class = 1.0 - outliers

    pipeline = Pipeline([
        ('std', None),
        ('clf', None),
    ])
    if args.model == 'SVC':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [SVC()],
                'clf__kernel': ['rbf', 'poly'],
                'clf__C': [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                'clf__gamma': ['scale'],
            },
        ]
    elif args.model == 'LSVC':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [LinearSVC()],
                'clf__C': [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                'clf__max_iter': [100000],
            },
        ]
    elif args.model == 'LR':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [LogisticRegression()],
            },
        ]
    elif args.model == 'RF':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [RandomForestClassifier()],
                'clf__n_estimators': [
                    500,
                ],
                'clf__max_depth': [None, 2, 8, 16],
                'clf__min_samples_split': [2, 0.1, 0.5],
                'clf__max_features': ['sqrt', 'log2'],
            },
        ]
    elif args.model == 'GB':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [GradientBoostingClassifier()],
                'clf__loss': ['deviance', 'exponential'],
                'clf__learning_rate': [0.5, 0.1, 0.01, 0.001],
                'clf__n_estimators': [32, 100, 200, 500],
                'clf__max_depth': [2, 4, 8, 16],
                'clf__min_samples_split': [2, 0.1, 0.5],
            },
        ]
    elif args.model == 'IF':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [IsolationForest()],
                'clf__n_estimators': [20, 50, 100, 200],
                'clf__contamination':
                [outliers, outliers + 0.025, outliers - 0.025],
                'clf__max_samples': ['auto', 0.1],
                'clf__bootstrap': [True, False],
                'clf__behaviour': [True],
            },
        ]
    elif args.model == 'OCSVM':
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'clf': [OneClassSVM()],
                'clf__kernel': ['rbf', 'poly', 'linear'],
                'clf__gamma': ['scale', 'auto'],
                'clf__nu': [outliers / 2, outliers, outliers * 2],
            },
        ]

    run_dir_name = args.run_dir
    name_prefix = ''
    name_prefix += 'ablation_re_' if args.only_re else ''
    name_prefix += 'ablation_ln_' if args.only_ln else ''
    assert args.only_re or args.only_ln
    chosen_ind = 0 if args.only_re else 1

    dataset_names = ['train']
    for adv_type in adv_types:
        dataset_names.append(f'clean_{adv_type}')
        dataset_names.append(f'adv_{adv_type}')
        dataset_names.append(f'noisy_{adv_type}')

    # TODO results dict, aggregate and print mean and stddev to a new file
    results = {}
    for run_n in range(args.runs):
        results[run_n] = {}
        run_dir = Path(f'{run_dir_name}_{run_n}')
        run_name = str(run_dir)
        assert run_dir.exists()

        datasets = {}
        for name in dataset_names:
            dataset_path = run_dir / f'ae_encoded_{name}.npy'
            if dataset_path.exists():
                loaded = np.load(str(dataset_path))
                print(f'loaded.shape: {loaded.shape}')
                # TODO
                filtered = loaded[:, chosen_ind::2]
                print(f'filtered.shape: {filtered.shape}')
                datasets[name] = filtered
            else:
                print(f'{dataset_path} is missing!')

        # for supervised we consider two setups - "known attack" (left half of table 3 from "A Simple Unified Framework...")
        # and "unknown attack" where we train only on FGSM and validate on the rest
        # for unsupervised we train on entire training data(! - change if needed) and test on clean/adv/noisy
        if is_supervised:
            # "known" part
            results_filename = f'{run_name}/{name_prefix}results_{args.model}_known.txt'
            with open(results_filename, 'w') as results_file:
                for adv_type in adv_types:
                    model_filename = f'{run_name}/{name_prefix}final_cv_{args.model}_known_{adv_type}.joblib'
                    train_split = 0.1
                    train_size = int(train_split *
                                     len(datasets[f'clean_{adv_type}']))
                    test_size = len(datasets[f'clean_{adv_type}']) - train_size
                    X = np.concatenate([
                        datasets[f'clean_{adv_type}'][:train_size],
                        datasets[f'adv_{adv_type}'][:train_size],
                        datasets[f'noisy_{adv_type}'][:train_size],
                    ])
                    y = np.concatenate([
                        np.ones(train_size),
                        np.zeros(train_size),
                        np.ones(train_size),
                    ])
                    X_test = np.concatenate([
                        datasets[f'clean_{adv_type}'][train_size:],
                        datasets[f'adv_{adv_type}'][train_size:],
                        datasets[f'noisy_{adv_type}'][train_size:],
                    ])
                    y_test = np.concatenate([
                        np.ones(test_size),
                        np.zeros(test_size),
                        np.ones(test_size),
                    ])
                    if not Path(model_filename).exists():
                        # train
                        with parallel_backend('loky', n_jobs=args.jobs):
                            gs = GridSearchCV(pipeline,
                                              params,
                                              scoring=make_scorer(
                                                  roc_auc_score,
                                                  needs_threshold=True),
                                              cv=StratifiedKFold(5),
                                              verbose=1)
                            gs.fit(X, y)
                        # save model
                        joblib.dump(gs, model_filename)
                    else:
                        gs = joblib.load(model_filename)
                    print(f'Best params on {adv_type}: {gs.best_params_}',
                          file=results_file)
                    # print feature importance on Random Forest
                    if args.model == 'RF':
                        rf = gs.best_estimator_['clf']
                        print(
                            f'RF feature importance for {adv_type}: \n {rf.feature_importances_.tolist()}',
                            file=results_file)
                    # validate
                    y_pred = gs.predict(X_test)
                    try:
                        y_scores = gs.decision_function(X_test)
                    except AttributeError:
                        y_scores = gs.predict_proba(X_test)
                        if y_scores.ndim > 1:
                            y_scores = y_scores[:, 1]
                    acc = accuracy_score(y_test, y_pred)
                    auroc = roc_auc_score(y_test, y_scores)
                    print(f'Accuracy on {adv_type}: {acc}', file=results_file)
                    results[run_n][f'acc_known_{adv_type}'] = acc
                    print(f'AUROC on {adv_type}: {auroc}', file=results_file)
                    results[run_n][f'auroc_known_{adv_type}'] = auroc
            # "unknown/FGSM" part
            results_filename = f'{run_name}/{name_prefix}results_{args.model}_unknown.txt'
            with open(results_filename, 'w') as results_file:
                model_filename = f'{run_name}/{name_prefix}final_cv_{args.model}_unknown.joblib'
                # train on FGSM
                train_split = 0.1
                train_size = int(train_split *
                                 len(datasets[f'clean_{adv_types[0]}']))
                test_size = len(datasets[f'clean_{adv_types[0]}']) - train_size
                X = np.concatenate([
                    datasets[f'clean_{adv_types[0]}'][:train_size],
                    datasets[f'adv_{adv_types[0]}'][:train_size],
                    datasets[f'noisy_{adv_types[0]}'][:train_size],
                ])
                y = np.concatenate([
                    np.ones(train_size),
                    np.zeros(train_size),
                    np.ones(train_size),
                ])
                X_test = np.concatenate([
                    datasets[f'clean_{adv_types[0]}'][train_size:],
                    datasets[f'adv_{adv_types[0]}'][train_size:],
                    datasets[f'noisy_{adv_types[0]}'][train_size:],
                ])
                y_test = np.concatenate([
                    np.ones(test_size),
                    np.zeros(test_size),
                    np.ones(test_size),
                ])
                if not Path(model_filename).exists():
                    # train
                    with parallel_backend('loky', n_jobs=args.jobs):
                        gs = GridSearchCV(pipeline,
                                          params,
                                          scoring=make_scorer(
                                              roc_auc_score,
                                              needs_threshold=True),
                                          cv=StratifiedKFold(5),
                                          verbose=1)
                        gs.fit(X, y)
                    # save model
                    joblib.dump(gs, model_filename)
                else:
                    gs = joblib.load(model_filename)
                print(f'Best params: {gs.best_params_}', file=results_file)
                # print feature importance on Random Forest
                if args.model == 'RF':
                    rf = gs.best_estimator_['clf']
                    print(
                        f'RF feature importance: \n {rf.feature_importances_.tolist()}',
                        file=results_file)
                # test
                y_pred = gs.predict(X_test)
                try:
                    y_scores = gs.decision_function(X_test)
                except AttributeError:
                    y_scores = gs.predict_proba(X_test)
                    if y_scores.ndim > 1:
                        y_scores = y_scores[:, 1]
                acc = accuracy_score(y_test, y_pred)
                auroc = roc_auc_score(y_test, y_scores)
                print(f'Accuracy on {adv_types[0]}: {acc}', file=results_file)
                results[run_n][f'acc_unknown_{adv_types[0]}'] = acc
                print(f'AUROC on {adv_types[0]}: {auroc}', file=results_file)
                results[run_n][f'auroc_unknown_{adv_types[0]}'] = auroc
                # and test on the rest
                for adv_type in adv_types[1:]:
                    test_size = len(datasets[f'clean_{adv_type}'])
                    X_test = np.concatenate([
                        datasets[f'clean_{adv_type}'],
                        datasets[f'adv_{adv_type}'],
                        datasets[f'noisy_{adv_type}'],
                    ])
                    y_test = np.concatenate([
                        np.ones(test_size),
                        np.zeros(test_size),
                        np.ones(test_size),
                    ])
                    # validate
                    y_pred = gs.predict(X_test)
                    try:
                        y_scores = gs.decision_function(X_test)
                    except AttributeError:
                        y_scores = gs.predict_proba(X_test)
                        if y_scores.ndim > 1:
                            y_scores = y_scores[:, 1]
                    acc = accuracy_score(y_test, y_pred)
                    auroc = roc_auc_score(y_test, y_scores)
                    print(f'Accuracy on {adv_type}: {acc}', file=results_file)
                    results[run_n][f'acc_unknown_{adv_type}'] = acc
                    print(f'AUROC on {adv_type}: {auroc}', file=results_file)
                    results[run_n][f'auroc_unknown_{adv_type}'] = auroc
        else:
            model_filename = f'{run_name}/{name_prefix}final_cv_{args.model}.joblib'
            results_filename = f'{run_name}/{name_prefix}results_{args.model}.txt'
            if not Path(model_filename).exists():
                # use only train dataset for one-class classifiers
                X = datasets['train']
                train_size = len(X)
                y = np.ones(train_size)
                with parallel_backend('loky', n_jobs=args.jobs):
                    gs = GridSearchCV(pipeline,
                                      params,
                                      scoring=make_scorer(
                                          score_func, greater_is_better=False),
                                      cv=5,
                                      verbose=1)
                    gs.fit(X, y)
                # save model
                joblib.dump(gs, model_filename)
            else:
                gs = joblib.load(model_filename)
            # evaluate
            for adv_type in adv_types:
                test_size = len(datasets[f'clean_{adv_type}'])
                X_test = np.concatenate([
                    datasets[f'clean_{adv_type}'],
                    datasets[f'adv_{adv_type}'],
                    datasets[f'noisy_{adv_type}'],
                ])
                y_test = np.concatenate([
                    np.ones(test_size),
                    np.zeros(test_size),
                    np.ones(test_size),
                ])
                y_pred = gs.predict(X_test)
                try:
                    y_scores = gs.decision_function(X_test)
                except AttributeError:
                    y_scores = gs.predict_proba(X_test)[:, 1]
                acc = accuracy_score(y_test, y_pred)
                auroc = roc_auc_score(y_test, y_scores)
                results[run_n][f'acc_{adv_type}'] = acc
                results[run_n][f'auroc_{adv_type}'] = auroc
            # save results
            with open(results_filename, 'w') as results_file:
                print(f'Best score: {gs.best_score_}', file=results_file)
                print(f'Best params: {gs.best_params_}', file=results_file)
                for adv_type in adv_types:
                    print(
                        f"Accuracy on {adv_type}: {results[run_n][f'acc_{adv_type}']}",
                        file=results_file)
                    print(
                        f"AUROC on {adv_type}: {results[run_n][f'auroc_{adv_type}']}",
                        file=results_file)
    results_filename = f'{name_prefix}{run_dir_name}_{args.model}.txt'
    with open(results_filename, 'w') as results_file:
        for adv_type in adv_types:
            if is_supervised:
                # known
                res = np.array([
                    results[i][f'acc_known_{adv_type}']
                    for i in range(args.runs)
                ])
                print(
                    f'Acc on {adv_type}(known): {res.mean()} +/- {res.std(ddof=1)}',
                    file=results_file)
                res = np.array([
                    results[i][f'auroc_known_{adv_type}']
                    for i in range(args.runs)
                ])
                print(
                    f'AUROC on {adv_type}(known): {res.mean()} +/- {res.std(ddof=1)}',
                    file=results_file)
                # unknown
                res = np.array([
                    results[i][f'acc_unknown_{adv_type}']
                    for i in range(args.runs)
                ])
                print(
                    f'Acc on {adv_type}(unknown): {res.mean()} +/- {res.std(ddof=1)}',
                    file=results_file)
                res = np.array([
                    results[i][f'auroc_unknown_{adv_type}']
                    for i in range(args.runs)
                ])
                print(
                    f'AUROC on {adv_type}(unknown): {res.mean()} +/- {res.std(ddof=1)}',
                    file=results_file)
            else:
                res = np.array(
                    [results[i][f'acc_{adv_type}'] for i in range(args.runs)])
                print(f'Acc on {adv_type}: {res.mean()} +/- {res.std(ddof=1)}',
                      file=results_file)
                res = np.array([
                    results[i][f'auroc_{adv_type}'] for i in range(args.runs)
                ])
                print(
                    f'AUROC on {adv_type}: {res.mean()} +/- {res.std(ddof=1)}',
                    file=results_file)
Example #10
import os
os.environ['SKLEARN_SITE_JOBLIB'] = "1"
from dask.distributed import Client
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
import joblib

client = Client(processes=False)
diabetes = datasets.load_diabetes()
X = diabetes.data[:50]
y = diabetes.target[:50]

model = linear_model.LinearRegression()

with joblib.parallel_backend('dask'):
    cv_results = cross_validate(model,
                                X,
                                y,
                                cv=10,
                                return_train_score=False,
                                verbose=100)

##################################################################
from dask_kubernetes import KubeCluster
from dask.distributed import Client
import os
os.environ['SKLEARN_SITE_JOBLIB'] = "1"
from dask.distributed import Client
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
Example #11
def main():
    print(args)

    pipeline = Pipeline([
        ("std", None),
        ("dim_red", None),
        ("clf", None),
    ])

    n_components_gs = [3, 5, 15]

    if args.model == "SVC":
        SVM = SVC
        params = [
            {
                "std": [StandardScaler()] if args.pca else
                [MinMaxScaler(), StandardScaler(), None],
                "dim_red": [PCA()] if args.pca else [None],
                "clf": [SVM()],
                "clf__kernel": ["rbf", "poly"],
                "clf__C": [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                "clf__gamma": ["scale"],
            },
        ]
        if args.pca:
            params[0]["dim_red__n_components"] = n_components_gs
    elif args.model == "LSVC":
        LinearSVM = LinearSVC
        params = [
            {
                "std": [MinMaxScaler(), StandardScaler(), None],
                "dim_red": [PCA()] if args.pca else [None],
                "clf": [LinearSVM()],
                "clf__C": [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                "clf__max_iter": [100000],
            },
        ]
        if args.pca:
            params[0]["dim_red__n_components"] = n_components_gs
    elif args.model == "LR":
        LR = LogisticRegression
        params = [
            {
                "std": [MinMaxScaler(), StandardScaler(), None],
                "dim_red": [PCA()] if args.pca else [None],
                "clf": [LR()],
            },
        ]
        if args.pca:
            params[0]["dim_red__n_components"] = n_components_gs
    elif args.model == 'LASSO':
        lasso = SGDClassifier
        loss = ''
        params = [
            {
                'std': [MinMaxScaler(), StandardScaler(), None],
                'dim_red': [PCA()] if args.pca else [None],
                'clf': [lasso()],
                'clf__loss': ['squared_loss'],
                'clf__penalty': ['l1'],
                'clf__alpha': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
            },
        ]
        if args.pca:
            params[0]['dim_red__n_components'] = n_components_gs
    elif args.model == "RF":
        RF = RandomForestClassifier
        params = [
            {
                "std": [MinMaxScaler(), StandardScaler(), None],
                "dim_red": [PCA()] if args.pca else [None],
                "clf": [RF()],
                "clf__n_estimators": [
                    500,
                ],
                "clf__max_depth": [None, 2, 8, 16],
                "clf__min_samples_split": [2, 0.1, 0.5],
                "clf__max_features": ["sqrt", "log2"],
            },
        ]
        if args.pca:
            params[0]["dim_red__n_components"] = n_components_gs
    elif args.model == "GB":
        GB = GradientBoostingClassifier
        loss_list = ["deviance", "exponential"]
        params = [
            {
                "std": [MinMaxScaler(), StandardScaler(), None],
                "dim_red": [PCA()] if args.pca else [None],
                "clf": [GB()],
                "clf__loss": loss_list,
                "clf__learning_rate": [0.1, 0.01, 0.001],
                "clf__n_estimators": [32, 100, 500],
                "clf__max_depth": [2, 8, 16],
                "clf__min_samples_split": [2, 0.1, 0.5],
            },
        ]
        if args.pca:
            params[0]["dim_red__n_components"] = n_components_gs

    X, y = read_static_data(args.data,
                            args.labels,
                            args.include,
                            skip_control=not args.with_control)
    print(f"X shape: {X.shape} y shape: {y.shape}")
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        stratify=y)
    print(f'trainset size: {X_train.shape[0]}')

    results_dir = args.results_dir
    results_dir.mkdir(parents=True, exist_ok=True)
    included = set(args.include)
    included = ("all" if included == {"FC", "REHO", "ALFF", "fALFF"} else
                "_".join(sorted(included)))
    filename_infix = f"{included}{'_pca' if args.pca else ''}_{args.model}"
    model_path = results_dir / f"model_{filename_infix}.joblib"
    results_path = results_dir / f"results_{filename_infix}.txt"
    importances_path = results_dir / f"importances_{filename_infix}.csv"
    roc_fpr_path = results_dir / f"roc_fpr_{filename_infix}.csv"
    roc_tpr_path = results_dir / f"roc_tpr_{filename_infix}.csv"
    roc_thr_path = results_dir / f"roc_thr_{filename_infix}.csv"

    with parallel_backend("loky", n_jobs=args.jobs), open(results_path,
                                                          "w") as results_file:
        folding = StratifiedKFold(5)
        scoring = make_scorer(roc_auc_score, needs_threshold=True)
        if not model_path.exists():
            gs = GridSearchCV(pipeline,
                              params,
                              scoring=scoring,
                              cv=folding,
                              verbose=1)
            gs.fit(X_train, y_train)
            # save model
            joblib.dump(gs, model_path.resolve())
        else:
            gs = joblib.load(model_path.resolve())
        print(f"Best params: {gs.best_params_}", file=results_file)
        # print variance explained for PCA
        if args.pca:
            pca = gs.best_estimator_["dim_red"]
            print(
                f"PCA variance explained: \n {pca.explained_variance_ratio_.tolist()}",
                file=results_file,
            )
        # print feature importance on Random Forest
        if args.model == "RF":
            rf = gs.best_estimator_["clf"]
            importances = rf.feature_importances_
            if args.pca:
                importances = pca.inverse_transform(importances)
                importances = np.absolute(importances)
                importances /= np.sum(importances)
            np.savetxt(importances_path, [importances], delimiter=',')
        elif args.model == "LR" or args.model == "LASSO":
            model = gs.best_estimator_["clf"]
            importances = model.coef_
            if args.pca:
                importances = pca.inverse_transform(importances)
            importances = np.absolute(importances)
            importances /= np.sum(importances)
            np.savetxt(importances_path, importances, delimiter=',')
        # validate
        y_test_pred = gs.predict(X_test)
        if hasattr(gs, 'decision_function'):
            y_scores = gs.decision_function(X_test)
        elif hasattr(gs, 'predict_proba'):
            y_scores = gs.predict_proba(X_test)
            if y_scores.ndim > 1:
                y_scores = y_scores[:, 1]
        else:
            y_scores = None
        acc = accuracy_score(y_test, y_test_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
        specificity = tn / (tn + fp)
        precision = tp / (tp + fp)
        print(f"Accuracy: {acc}", file=results_file)
        print(f"Specificity: {specificity}", file=results_file)
        print(f"Precision: {precision}", file=results_file)
        if y_scores is not None:
            auroc = roc_auc_score(y_test, y_scores)
            print(f"AUROC: {auroc}", file=results_file)
            fprs, tprs, thrhlds = roc_curve(y_test, y_scores)
            np.savetxt(roc_fpr_path, fprs, delimiter=',')
            np.savetxt(roc_tpr_path, tprs, delimiter=',')
            np.savetxt(roc_thr_path, thrhlds, delimiter=',')
            # validation score
            print(f"Validation AUROC: {gs.best_score_}", file=results_file)
        # test on trainset
        y_train_pred = gs.predict(X_train)
        if hasattr(gs, 'decision_function'):
            y_train_scores = gs.decision_function(X_train)
        elif hasattr(gs, 'predict_proba'):
            y_train_scores = gs.predict_proba(X_train)
            if y_train_scores.ndim > 1:
                y_train_scores = y_train_scores[:, 1]
        else:
            y_train_scores = None
        train_acc = accuracy_score(y_train, y_train_pred)
        print(f"Train accuracy: {train_acc}", file=results_file)
        if y_train_scores is not None:
            train_auroc = roc_auc_score(y_train, y_train_scores)
            print(f"Train AUROC: {train_auroc}", file=results_file)
        # calculate p-values
        best_estimator = gs.best_estimator_
        pvalue, permutation_scores = p_value_permute(best_estimator, auroc,
                                                     scoring, X_train, y_train,
                                                     X_test, y_test)
        print(f"Test p-value: {pvalue}", file=results_file)
        # print(f"Test permutation scores: {permutation_scores}", file=results_file)

    if args.shell:
        IPython.embed()
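p_value_permute() is not defined in the snippet above; a hypothetical sketch consistent with the call site, which refits the selected pipeline on label-shuffled training data and compares permuted test scores against the observed AUROC:

# Hypothetical sketch of p_value_permute() as called above: refit the chosen
# estimator on permuted training labels n times and count how often the
# permuted test score reaches the observed score.
import numpy as np
from sklearn.base import clone


def p_value_permute(estimator, observed_score, scoring, X_train, y_train,
                    X_test, y_test, n_permutations=100, random_state=0):
    rng = np.random.default_rng(random_state)
    permutation_scores = []
    for _ in range(n_permutations):
        y_perm = rng.permutation(y_train)
        model = clone(estimator).fit(X_train, y_perm)
        permutation_scores.append(scoring(model, X_test, y_test))
    permutation_scores = np.asarray(permutation_scores)
    pvalue = (np.sum(permutation_scores >= observed_score) + 1) / (n_permutations + 1)
    return pvalue, permutation_scores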
Example #12
    def dask_clusterMethod(self, cluster_method, mname, data):
        try:
            logger.info('[{}] : [INFO] Loading Clustering method {}'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                type(cluster_method)))
            # delattr(cluster_method, 'behaviour')
            # del cluster_method.__dict__['behaviour']
            for k, v in cluster_method.get_params().items():
                logger.info(
                    '[{}] : [INFO] Method parameter {} set to {}'.format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
            try:
                with joblib.parallel_backend('dask'):
                    logger.info(
                        '[{}] : [INFO] Using Dask backend for user defined method'
                        .format(
                            datetime.fromtimestamp(
                                time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                    clf = cluster_method.fit(data)
            except Exception as inst:
                logger.error(
                    '[{}] : [ERROR] Failed to fit user defined method with dask backend with {} and {}'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        type(inst), inst.args))
                logger.warning(
                    '[{}] : [WARN] using default process based backend for user defined method'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf = cluster_method.fit(data)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to fit {} with {} and {}'.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    type(cluster_method), type(inst), inst.args))
            sys.exit(1)
        predictions = clf.predict(data)
        if list(np.unique(predictions)) == [0, 1]:
            anomaly_marker = 1
            normal_marker = 0
        else:
            anomaly_marker = -1
            normal_marker = 1
        logger.info(
            '[{}] : [INFO] Number of Predicted Anomalies {} from a total of {} datapoints.'
            .format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                list(predictions).count(anomaly_marker),
                len(list(predictions))))
        logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            predictions))
        fname = str(clf).split('(')[0]
        self.__serializemodel(clf, fname, mname)
        self.__plot_feature_sep(data,
                                predictions,
                                method=fname,
                                mname=mname,
                                anomaly_label=anomaly_marker,
                                normal_label=normal_marker)
        self.__decision_boundary(clf,
                                 data,
                                 method=fname,
                                 mname=mname,
                                 anomaly_label=anomaly_marker)

        return clf
Example #13
    def dask_isolationForest(self, settings, mname, data):
        '''
        :param settings: -> settings dictionary
        :param mname: -> name of serialized clusterer
        :param scaler: -> scaler to use on data
        :return: -> isolation forest instance
        :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
        '''
        if not settings or settings is None:
            logger.warning(
                '[{}] : [WARN] No IsolationForest parameters defined using defaults'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            # print(settings)
            settings = {}
        else:
            for k, v in settings.items():
                logger.info(
                    '[{}] : [INFO] IsolationForest parameter {} set to {}'.
                    format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v))
        try:

            clf = IsolationForest(**settings)
            # print(clf)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to instantiate IsolationForest with {} and {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(1)

        try:
            with joblib.parallel_backend('dask'):
                logger.info(
                    '[{}] : [INFO] Using Dask backend for IsolationForest'.
                    format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S')))
                clf.fit(data)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Failed to fit IsolationForest with {} and {}'.
                format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(1)

        predict = clf.predict(data)
        anoOnly = np.argwhere(predict == -1)
        logger.info(
            '[{}] : [INFO] Found {} anomalies in training dataset of shape {}.'
            .format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), len(anoOnly),
                data.shape))
        logger.debug('[{}] : [DEBUG] Predicted Anomaly Array {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            predict))
        self.__serializemodel(clf, 'isoforest', mname)
        self.__appendPredictions(method='isoforest',
                                 mname=mname,
                                 data=data,
                                 pred=predict)
Example #14
fit_data = namedtuple(
    "FitData",
    (
        "endog_data",
        "exog_data",
        "master_mask",
        "filled_datasets",
        "masked_datasets",
        "land_mask",
    ),
)

n_jobs = 5
with parallel_backend("loky",
                      n_jobs=n_jobs,
                      inner_max_num_threads=math.floor(get_ncpus() / n_jobs)):
    outputs = thres_fire_season_stats(0.1)

fire_season_mask = [out for out in outputs if out[0] == ba_dataset][0][4]

shift_months = [1, 3, 6, 12, 24]

selection_variables = (
    "VOD Ku-band -3 Month",
    "SIF",
    "VOD Ku-band -1 Month",
    "Dry Day Period -3 Month",
    "FAPAR",
    "pftHerb",
    "LAI -1 Month",
Example #15
    def training_loop(self) -> None:
        register_ray()

        self.estimator.set_params(**self.params)

        datasets = self._get_datasets()
        X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)
        groups = None
        if "cv_groups" in X_train.columns:
            groups = X_train["cv_groups"]
            X_train = X_train.drop("cv_groups", axis=1)

        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config
        )

        num_workers = scaling_config_dataclass.num_workers or 0
        assert num_workers == 0  # num_workers is not in scaling config allowed_keys

        trainer_resources = scaling_config_dataclass.trainer_resources or {"CPU": 1}
        has_gpus = bool(trainer_resources.get("GPU", 0))
        num_cpus = int(trainer_resources.get("CPU", 1))

        # see https://scikit-learn.org/stable/computing/parallelism.html
        os.environ["OMP_NUM_THREADS"] = str(num_cpus)
        os.environ["MKL_NUM_THREADS"] = str(num_cpus)
        os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
        os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

        parallelize_cv = self._get_cv_parallelism(has_gpus)
        if self.set_estimator_cpus:
            num_estimator_cpus = 1 if parallelize_cv else num_cpus
            _set_cpu_params(self.estimator, num_estimator_cpus)

        with parallel_backend("ray", n_jobs=num_cpus):
            start_time = time()
            self.estimator.fit(X_train, y_train, **self.fit_params)
            fit_time = time() - start_time

            with tune.checkpoint_dir(step=1) as checkpoint_dir:
                with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                    cpickle.dump(self.estimator, f)

                if self.preprocessor:
                    save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

            if self.label_column:
                validation_set_scores = self._score_on_validation_sets(
                    self.estimator, datasets
                )
                cv_scores = self._score_cv(
                    self.estimator,
                    X_train,
                    y_train,
                    groups,
                    # if estimator has parallelism, use that. Otherwise,
                    # parallelize CV
                    n_jobs=1 if not parallelize_cv else num_cpus,
                )
            else:
                validation_set_scores = {}
                cv_scores = {}

        # cv_scores will not override validation_set_scores as we
        # check for that during initialization
        results = {
            **validation_set_scores,
            **cv_scores,
            "fit_time": fit_time,
        }
        tune.report(**results)
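A minimal standalone sketch of the joblib-on-Ray pattern used by training_loop above, assuming Ray is installed and can be initialised locally:

# Minimal sketch of scikit-learn fitting through the Ray joblib backend,
# as registered via register_ray() in training_loop above.
import ray
from ray.util.joblib import register_ray
from joblib import parallel_backend
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

ray.init(ignore_reinit_error=True)
register_ray()

X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {"C": [0.1, 1.0, 10.0]}, cv=3)

with parallel_backend("ray", n_jobs=3):
    search.fit(X, y)

print(search.best_params_)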
Example #16
    def train(self,
              df_model_train,
              model_name,
              df_model_valid=None,
              weight=None):

        model_q, evals_result_q = {}, {}
        if 'mean' in self.regression_params['type']:
            num_rounds, early_stopping = self.determine_num_rounds(
                df_model_train, model_name, objective='mean', weight=weight)
            # Train model for mean
            model, evals_result = self.create_fit_model(
                model_name,
                df_model_train,
                objective='mean',
                df_model_valid=df_model_valid,
                weight=weight,
                num_rounds=num_rounds,
                early_stopping=early_stopping)

            model_q['mean'] = model
            evals_result_q['mean'] = evals_result

        if 'quantile' in self.regression_params['type']:
            num_rounds, early_stopping = self.determine_num_rounds(
                df_model_train,
                model_name,
                objective='quantile',
                weight=weight)

            # Train models for different quantiles
            with joblib.parallel_backend(self.parallel_processing['backend']):
                results = joblib.Parallel(
                    n_jobs=self.parallel_processing['n_workers'])(
                        joblib.delayed(self.create_fit_model)(
                            model_name,
                            df_model_train,
                            objective='quantile',
                            alpha=alpha,
                            df_model_valid=df_model_valid,
                            weight=weight,
                            num_rounds=num_rounds,
                            early_stopping=early_stopping)
                        for alpha in self.alpha_q)

            for (model, evals_result), alpha in zip(results, self.alpha_q):
                model_q['quantile{0:.2f}'.format(alpha)] = model
                evals_result_q['quantile{0:.2f}'.format(alpha)] = evals_result

        if not (('mean' in self.regression_params['type']) or
                ('quantile' in self.regression_params['type'])):
            raise ValueError(
                'Value of regression parameter "objective" not recognized.')

        # Convert evals_result_q to dataframe
        data = {(level1_key, level2_key): pd.Series(values)
                for level1_key in evals_result_q.keys()
                for level2_key, values in evals_result_q[level1_key].items()}
        df_evals_result_q = pd.DataFrame(data)
        df_evals_result_q.index.name = 'iterations'

        return model_q, df_evals_result_q
Example #17
#---#


def f(i):
    return i * 10


#---#

trials = 100

#---#

print(trials)

with parallel_backend('loky', n_jobs=2):
    lst = Parallel()(delayed(f)(i) for i in range(trials))

#chk>

print(lst)

#---#

lst2 = [2 * i for i in range(trials)]

#---#

print(lst2)
Example #18

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-n_jobs", type=int, required=True)
    parser.add_argument("-num_summaries", type=int, required=True)
    parser.add_argument("-in_dir", type=str, required=True)
    parser.add_argument("-out_dir", type=str, required=True)
    args = parser.parse_args()

    assert os.path.isdir(args.in_dir)
    assert os.path.isdir(args.out_dir)

    # Read summary files in parallel
    input_fnames = listdir_fullpath(args.in_dir)
    with parallel_backend('multiprocessing', n_jobs=args.n_jobs):
        all_summaries = Parallel()(delayed(read_out_file)(idx, fname)
                                   for idx, fname in enumerate(input_fnames))
    # sort summaries according to document number
    all_summaries = sorted(all_summaries, key=lambda x: x[0])
    all_summaries = [tup[1] for tup in all_summaries]
    with parallel_backend('multiprocessing', n_jobs=args.n_jobs):
        unique_summaries = Parallel()(
            delayed(remove_duplicates)(idx, summaries)
            for idx, summaries in enumerate(all_summaries))

    output_fnames = [
        args.out_dir + "/out_{}.txt".format(i)
        for i in range(args.num_summaries)
    ]
    write_to_files(unique_summaries, output_fnames)
Beispiel #19
0
        AID_path = os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID)
    else:
        AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', AID)
    save_path = AID_path + '/' + AID + 'graph_processed.pkl'
    pickle_off = open(save_path, 'rb')
    activity_table = pickle.load(pickle_off)
    pickle_off.close()
    '''Pick a diverse starting point of 15% of the library'''
    fplist = [x for x in activity_table['bit_MFP']]
    '''start_indexs holds the indexes of molecules already scanned at the
        start of each iteration. So for the first iteration it holds the diversity
        selection. For the second, it holds both the diversity selection and the
        molecules screened based on the results of the first training iteration, etc.'''
    #build up metalists that will vary for each repetition
    start_num = int(len(fplist) * 0.15)
    with parallel_backend('multiprocessing'):
        start_index_metalist = Parallel(n_jobs=3)(
            delayed(getNextIterInds)(
                firstPicksList=i, fplist=j, bottom_to_select=k)
            for i, j, k in zip([[], [], []], [fplist, fplist, fplist],
                               [start_num, start_num, start_num]))
#            start_indexs = np.array(mmp.LazyBitVectorPick(fplist,len(fplist),int(len(fplist)/10)))
    '''store in a list that will vary as each model makes its predictions'''
    for rep_num in range(3):
        #set rep specific variables
        metric_dict_list = metric_dict_metalist[rep_num]
        start_indexs = start_index_metalist[rep_num]
        multi_dump_path = os.path.join(
            '/home/gabriel/Dropbox/UCL/Thesis/Data/',
            exper_file_name + str(rep_num) + '.pkl')
        #everything from here forwards doesn't need to change
Beispiel #20
0
def joblib_process(sub_f, *args):
    # Apply sub_f to every image in args[0], passing args[0] and args[1] through as extra arguments.
    with parallel_backend('multiprocessing', n_jobs=-1):
        res = Parallel()(delayed(sub_f)(img, args[0], args[1]) for img in args[0])
    return res
Beispiel #21
0
 def cal_factors(self, start, end, n_jobs):
     # Statement types used: 408001000, 408005000, 408004000 (consolidated statement, consolidated
     # before correction, consolidated after adjustment); when both 408001000 and 408005000 exist,
     # use 408005000, and when 408004000 exists, use it at discretion based on ann_dt.
     # Fields currently included: net profit (net_profit), net profit excluding non-recurring items
     # (net_profit_ddt), operating revenue (oper_rev), total operating revenue (tot_oper_rev),
     # operating profit (oper_profit), diluted EPS (EPS_diluted), operating income (oper_income),
     # minority interest income (minority_int_inc), financial expenses (less_fin_exp),
     # net interest income (net_int_inc), EBIT, and statement type (statement_type)
     query = "select ANN_DT, S_INFO_WINDCODE, REPORT_PERIOD, NET_PROFIT_EXCL_MIN_INT_INC, " \
             "NET_PROFIT_AFTER_DED_NR_LP, OPER_REV, TOT_OPER_REV, TOT_OPER_COST, OPER_PROFIT, TOT_PROFIT, " \
             "S_FA_EPS_DILUTED, MINORITY_INT_INC, LESS_FIN_EXP, NET_INT_INC, EBIT, STATEMENT_TYPE " \
             "from wind_filesync.AShareIncome " \
             "where ANN_DT >= {0} and ANN_DT <= {1} " \
             "and (STATEMENT_TYPE = '408001000' or STATEMENT_TYPE = '408005000' or STATEMENT_TYPE = '408004000') " \
             "and (s_info_windcode like '0%' or s_info_windcode like '3%' or s_info_windcode like '6%') " \
             "order by report_period, ann_dt, statement_type " \
         .format((dtparser.parse(str(start)) - relativedelta(years=4)).strftime('%Y%m%d'), str(end))
     self.rdf.curs.execute(query)
     income = \
         pd.DataFrame(self.rdf.curs.fetchall(),
                      columns=['date', 'code', 'report_period', 'net_profit', 'net_profit_ddt', 'oper_rev',
                               'tot_oper_rev', 'tot_oper_cost', 'oper_profit', 'tot_profit', 'EPS_diluted',
                               'minority_interest_income', 'less_fin_exp', 'net_interest_income', 'EBIT', 'type'])
     income[['minority_interest_income', 'less_fin_exp', 'net_interest_income']] = \
         income[['minority_interest_income', 'less_fin_exp', 'net_interest_income']].fillna(0)
     # For the same code, date and report_period, when several statement types appear at once, keep the one with the largest type value
     income['type'] = income['type'].apply(
         lambda x: '2'
         if x == '408001000' else ('3' if x == '408005000' else '4'))
     income = income.sort_values(
         by=['code', 'date', 'report_period', 'type'])
     income['date'] = pd.to_datetime(income['date'])
     income['report_period'] = pd.to_datetime((income['report_period']))
     # ***************************************************************************
     # Read preliminary earnings reports (AShareProfitExpress)
     query = "select ANN_DT, S_INFO_WINDCODE, REPORT_PERIOD, OPER_REV, OPER_PROFIT, NET_PROFIT_EXCL_MIN_INT_INC, " \
             "TOT_PROFIT, EPS_DILUTED " \
             "from wind_filesync.AShareProfitExpress " \
             "where ANN_DT >= {0} and ANN_DT <= {1} " \
             "and (s_info_windcode like '0%' or s_info_windcode like '3%' or s_info_windcode like '6%') " \
             "order by report_period, ann_dt" \
         .format((dtparser.parse(str(start)) - relativedelta(years=4)).strftime('%Y%m%d'), str(end))
     self.rdf.curs.execute(query)
     express = pd.DataFrame(self.rdf.curs.fetchall(),
                            columns=[
                                'date', 'code', 'report_period', 'oper_rev',
                                'oper_profit', 'net_profit', 'tot_profit',
                                'EPS_diluted'
                            ])
     express['date'] = pd.to_datetime(express['date'])
     express['report_period'] = pd.to_datetime(express['report_period'])
     express['type'] = '1'
     # ***************************************************************************
     # Read earnings forecasts (AShareProfitNotice)
     query = "select S_PROFITNOTICE_DATE, S_INFO_WINDCODE, S_PROFITNOTICE_PERIOD, S_PROFITNOTICE_NETPROFITMIN, " \
             "S_PROFITNOTICE_NETPROFITMAX " \
             "from wind_filesync.AShareProfitNotice " \
             "where S_PROFITNOTICE_DATE >= {0} and S_PROFITNOTICE_DATE <= {1} " \
             "and (s_info_windcode like '0%' or s_info_windcode like '3%' or s_info_windcode like '6%') " \
             "order by S_PROFITNOTICE_PERIOD, S_PROFITNOTICE_DATE" \
         .format((dtparser.parse(str(start)) - relativedelta(years=4)).strftime('%Y%m%d'), str(end))
     self.rdf.curs.execute(query)
     notice = pd.DataFrame(self.rdf.curs.fetchall(),
                           columns=[
                               'date', 'code', 'report_period',
                               'net_profit_min', 'net_profit_max'
                           ])
     notice['date'] = pd.to_datetime(notice['date'])
     notice['report_period'] = pd.to_datetime(notice['report_period'])
     notice['type'] = '0'
     notice[['net_profit_min', 'net_profit_max']] = \
         notice[['net_profit_min', 'net_profit_max']].fillna(method='bfill', axis=1)
     notice[['net_profit_min', 'net_profit_max']] = \
         notice[['net_profit_min', 'net_profit_max']].fillna(method='ffill', axis=1)
     # Earnings forecast figures are reported in units of 10,000 yuan
     notice['net_profit'] = (0.5 * notice['net_profit_min'] +
                             0.5 * notice['net_profit_max']) * 10000
     notice.drop(['net_profit_min', 'net_profit_max'], axis=1, inplace=True)
     # ***************************************************************************
     income = pd.concat([income, express, notice], ignore_index=True)
     income = income.sort_values(
         by=['code', 'date', 'report_period', 'type'])
     # Operating income = net profit (incl. minority interest) - non-recurring items + financial expenses * (1 - 0.25) - net interest income * (1 - 0.25)
     #                  = net profit excl. non-recurring items + minority interest income + financial expenses * (1 - 0.25) - net interest income * (1 - 0.25)
     income['oper_income'] = income['net_profit_ddt'] + income['minority_interest_income'] + \
                             income['less_fin_exp'] * (1 - 0.25) - income['net_interest_income'] * (1 - 0.25)
     income[
         'gross_margin'] = income['tot_oper_rev'] - income['tot_oper_cost']
     # Required fields
     fields = [
         'net_profit', 'net_profit_ddt', 'oper_rev', 'tot_oper_rev',
         'tot_oper_cost', 'oper_profit', 'tot_profit', 'gross_margin',
         'EPS_diluted', 'oper_income', 'EBIT'
     ]
     #fields = ['EBIT']
     # Process the data
     calendar = self.rdf.get_trading_calendar()
     calendar = \
         set(calendar.loc[(calendar >= (dtparser.parse(str(start)) - relativedelta(years=2)).strftime('%Y%m%d')) &
                          (calendar <= str(end))])
     # Target database for storage
     save_db = 'FinancialReport_Gus'
     fail_list = []
     for f in fields:
         print('ALL ANNOUNCEMENT \n field: %s begins processing...' % f)
         df = pd.DataFrame(income.dropna(subset=[f]).groupby(['code', 'date', 'report_period'])[f].last()) \
             .reset_index()
         df = df.sort_values(by=['report_period', 'date'])
         df.set_index(['code', 'date', 'report_period'], inplace=True)
         df = df.unstack(level=2)
         df = df.loc[:, f]
         df = df.reset_index().set_index('date')
         codes = df['code'].unique()
         split_codes = np.array_split(codes, n_jobs)
         with parallel_backend('multiprocessing', n_jobs=n_jobs):
             res = Parallel()(delayed(IncomeUpdate.JOB_factors)(
                 df, f, codes, calendar, start, save_db)
                              for codes in split_codes)
         print('%s finish' % f)
         print('-' * 30)
         for r in res:
             fail_list.extend(r)
     return fail_list
Beispiel #22
0
# test
# date_family_size_list = list(zip(['2020-07-24', '2020-07-24'], ['VESTIDO', 'VESTIDO'], ['M', 'XXXL']))

# result
# Out[14]:
#          date family_desc  size  mean_weight_relative  mean_weight_abs      stock_nok
# 0  2020-07-24     VESTIDO     M              0.472393         0.032209        0
# 1  2020-07-24     VESTIDO  XXXL              0.722222         0.506944        1
#

#### test end #########

######################################################################################################################
# run

with parallel_backend('threading', n_jobs=6):
    date_family_size_var_valor_list = Parallel()(
        delayed(get_var_distr_relat_abs)(date_family_size, df, path_results)
        for date_family_size in date_family_size_list)

df_indicators = pd.concat(date_family_size_var_valor_list)

# without distr_relative

df_indicators_gr = df_indicators.groupby(['date', 'family_desc', 'size']).agg({
    'distr_abs':
    'mean'
}).reset_index()

# with distr_relative
# df_indicators_label = pd.merge(df_indicators, df_feedback, on=['date', 'family_desc', 'size'])
Beispiel #23
0
        full_input_image: xa.DataArray = xa.open_rasterio(image_data_path,
                                                          chunks=(35, 1000,
                                                                  1000))
        input_image = full_input_image[:, 1100:1400, 1100:
                                       1400] if subset else full_input_image
        space_coords = {
            key: input_image.coords[key].values
            for key in space_dims
        }
        ml_input_data: np.ndarray = preprocess(input_image)
        nodata_output = estimator.predict(np.zeros([1,
                                                    input_image.shape[0]]))[0]

        t1 = time.time()

        with joblib.parallel_backend('dask'):
            print(
                f"Executing {modelType} estimator: {saved_model_path}, parameters: { list(estimator.instance_parameters.items()) }"
            )
            ml_results: np.ndarray = estimator.predict(ml_input_data)

        t2 = time.time()

        depth_map_data: np.ndarray = ml_results.reshape(input_image.shape[1:])
        result_map = xa.DataArray(depth_map_data,
                                  coords=space_coords,
                                  dims=space_dims,
                                  name="depth_map")
        depth_map = result_map.where(result_map != nodata_output, 0.0)

        t3 = time.time()
Beispiel #24
0
 def process_data(self, start, end, n_jobs):
     calendar = self.rdf.get_trading_calendar()
     calendar = calendar[(calendar >= str(start)) & (calendar <= str(end))]
     # Get SSE 50 index weights
     weight_50 = self.idx_comp_sql.get_IndexComp(50, start, end)
     weight_50['index_code'] = '000016.SH'
     miss_dates = set(calendar) - set(weight_50.index.unique())
     if miss_dates:
         miss_dates = pd.DatetimeIndex(miss_dates).strftime('%Y%m%d')
         if miss_dates.shape[0] == 1:
             query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                     "from wind_filesync.AIndexSSE50Weight " \
                     "where TRADE_DT = {0}".format(miss_dates[0])
         else:
             query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                     "from wind_filesync.AIndexSSE50Weight " \
                     "where TRADE_DT in " + str(tuple(miss_dates))
         self.rdf.curs.execute(query)
         miss_df = pd.DataFrame(
             self.rdf.curs.fetchall(),
             columns=['date', 'index_code', 'code', 'weight'])
         miss_df['date'] = pd.to_datetime(miss_df['date'])
         miss_df.set_index('date', inplace=True)
         weight_50 = pd.concat([weight_50, miss_df])
     # Get CSI 300 index weights
     weight_300 = self.idx_comp_sql.get_IndexComp(300, start, end)
     weight_300['index_code'] = '000300.SH'
     miss_dates = set(calendar) - set(weight_300.index.unique())
     if miss_dates:
         dates_before_miss = {}
         for d in miss_dates:
             if calendar[calendar < d].empty:
                 pass
             else:
                 dates_before_miss[calendar[calendar < d].iloc[-1].strftime(
                     '%Y%m%d')] = d.strftime('%Y%m%d')
         if len(dates_before_miss) == 1:
             query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,i_weight " \
                     "from wind_filesync.AIndexHS300Weight " \
                     "where TRADE_DT = {0}".format(list(dates_before_miss.keys())[0])
         else:
             query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,i_weight " \
                     "from wind_filesync.AIndexHS300Weight " \
                     "where TRADE_DT in " + str(tuple(dates_before_miss.keys()))
         self.rdf.curs.execute(query)
         miss_df = pd.DataFrame(
             self.rdf.curs.fetchall(),
             columns=['last_date', 'index_code', 'code', 'weight'])
         miss_df['date'] = miss_df['last_date'].map(dates_before_miss)
         miss_df['date'] = pd.to_datetime(miss_df['date'])
         miss_df.drop('last_date', axis=1, inplace=True)
         miss_df.set_index('date', inplace=True)
         weight_300 = pd.concat([weight_300, miss_df])
     # Get CSI 500 index weights
     weight_500 = self.idx_comp_sql.get_IndexComp(500, start, end)
     weight_500['index_code'] = '000905.SH'
     miss_dates = set(calendar) - set(weight_500.index.unique())
     if miss_dates:
         miss_dates = pd.DatetimeIndex(miss_dates).strftime('%Y%m%d')
         if miss_dates.shape[0] == 1:
             query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                     "from wind_filesync.AIndexCSI500Weight " \
                     "where TRADE_DT = {0}".format(miss_dates[0])
         else:
             query = "select TRADE_DT,S_INFO_WINDCODE,S_CON_WINDCODE,weight " \
                     "from wind_filesync.AIndexCSI500Weight " \
                     "where TRADE_DT in " + str(tuple(miss_dates))
         self.rdf.curs.execute(query)
         miss_df = pd.DataFrame(
             self.rdf.curs.fetchall(),
             columns=['date', 'index_code', 'code', 'weight'])
         miss_df['date'] = pd.to_datetime(miss_df['date'])
         miss_df.set_index('date', inplace=True)
         weight_500 = pd.concat([weight_500, miss_df])
     ########################################################################
     weight = pd.concat([weight_50, weight_300, weight_500])
     weight['weight'] = weight['weight'].astype('float')
     codes = weight['code'].unique()
     split_codes = np.array_split(codes, n_jobs)
     with parallel_backend('multiprocessing', n_jobs=n_jobs):
         res = Parallel()(delayed(influxdbData.JOB_saveData)(
             weight, 'code', codes, self.db, self.measure)
                          for codes in split_codes)
     print('IndexWeight finish')
     print('-' * 30)
     fail_list = []
     for r in res:
         fail_list.extend(r)
     return fail_list
Beispiel #25
0
def npoclass(inputs,
             gpu_core=True,
             model_path=None,
             ntee_type='bc',
             n_jobs=4,
             backend='multiprocessing',
             batch_size_dl=64,
             verbose=1):

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Check model files.
    if ntee_type == 'bc' and model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_bc.zip, unzip, and specify model_path (default set to None)."
        )
    if ntee_type == 'mg' and model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_mg.zip, unzip, and specify model_path (default set to None)."
        )

    # Check ntee type.
    if ntee_type == 'bc':
        le_file_name = 'le_broad_cat.pkl'
    elif ntee_type == 'mg':
        le_file_name = 'le_major_group.pkl'
    else:
        raise ValueError(
            "ntee_type must be 'bc' (broad category) or 'mg' (major group)")

    # Read model and label encoder, if not read.
    global model_loaded, tokenizer_loaded, label_encoder
    try:
        assert model_loaded
        assert tokenizer_loaded
        assert label_encoder
    except:
        #load a pretrained model and tokenizer.
        model_loaded = BertForSequenceClassification.from_pretrained(
            model_path)
        tokenizer_loaded = BertTokenizer.from_pretrained(model_path)
        # Read label encoder.
        with open(model_path + le_file_name, 'rb') as label_encoder_pkl:
            label_encoder = pickle.load(label_encoder_pkl)

    # Select acceleration method.
    if gpu_core == True and torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count(),
              'Using GPU:', torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(seed_val)
        device = torch.device('cuda')
        model_loaded.cuda()
    else:
        print('No GPU acceleration available or gpu_core=False, using CPU.')
        device = torch.device('cpu')
        model_loaded.cpu()
    print('Encoding inputs ...')
    sleep(.5)  # Pause a second for better printing results.

    # Encode inputs.
    global func_encode_string, func_encode_string_batch  # Define as global, otherwise cannot pickle or very slow.

    def func_encode_string(text_string):
        encoded_dict = tokenizer_loaded.encode_plus(
            text_string,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            truncation='longest_first',
            padding='max_length',  # Max length accepted by model.
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        return encoded_dict

    def func_encode_string_batch(text_strings):
        encoded_dicts = []
        for text_string in text_strings:
            encoded_dicts += [func_encode_string(text_string)]
        return encoded_dicts

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []
    # Encode input string(s).
    if type(inputs) == list:
        if backend == 'multiprocessing':  # Multiprocessing is faster than loky in processing large objects.
            encoded_outputs = Parallel(
                n_jobs=n_jobs,
                backend="multiprocessing",
                batch_size='auto',
                verbose=verbose)(delayed(func_encode_string)(text_string)
                                 for text_string in inputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'sequential':
            for text_string in tqdm(inputs):
                encoded_output = func_encode_string(text_string)
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'dask':
            with joblib.parallel_backend('dask'):
                n_jobs = len(
                    client.scheduler_info()['workers'])  # Get the number of workers.
                string_chunks = partition_all(
                    math.ceil(len(inputs) / n_jobs),
                    inputs)  # Split the inputs into one chunk per worker.
                encoded_outputs = Parallel(
                    n_jobs=-1, batch_size='auto', verbose=verbose)(
                        delayed(func_encode_string_batch)(text_strings)
                        for text_strings in string_chunks)
                encoded_outputs = itertools.chain(*encoded_outputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
    if type(inputs) == str:
        encoded_output = func_encode_string(inputs)
        input_ids = [encoded_output['input_ids']]
        attention_masks = [encoded_output['attention_mask']]

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Prepare dataloader for efficient calculation.
    pred_data = TensorDataset(input_ids, attention_masks)
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data,
                                 sampler=pred_sampler,
                                 batch_size=batch_size_dl)

    # Start prediction.
    model_loaded.eval()
    logits_all = []
    print('Predicting categories ...')
    sleep(.5)  # Pause a second for better printing results.
    for batch in tqdm(pred_dataloader, mininterval=10):
        # Add batch to the pre-chosen device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model_loaded(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
        logits_all += outputs[0].tolist()

    # Calculate probabilities from the logits.
    logits_prob = tf.nn.sigmoid(logits_all).numpy().tolist()
    # Find the positions of max values in logits.
    logits_max = np.argmax(logits_prob, axis=1)
    # Transfer to labels.
    logits_labels = label_encoder.inverse_transform(logits_max)

    # Compile results to be returned.
    result_list = []
    for list_index in range(0, len(logits_labels)):
        result_dict = {}
        result_dict['recommended'] = logits_labels[list_index]
        conf_prob = logits_prob[list_index][logits_max[list_index]]
        if conf_prob >= .99:
            result_dict['confidence'] = 'high (>=.99)'
        elif conf_prob >= .95:
            result_dict['confidence'] = 'medium (<.99|>=.95)'
        else:
            result_dict['confidence'] = 'low (<.95)'
        prob_dict = {}
        for label_index in range(0, len(label_encoder.classes_)):
            prob_dict[label_encoder.classes_[label_index]] = logits_prob[
                list_index][label_index]
        result_dict['probabilities'] = prob_dict
        result_list += [result_dict]

    return result_list
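# Hedged usage sketch (an assumption, not part of the example above): the model
# archive must first be downloaded and unzipped as the error messages describe,
# and the path below is a hypothetical placeholder; backend='sequential' keeps the
# call free of any Dask client dependency.
example_results = npoclass(
    ['Provides college scholarships to first-generation students.'],
    gpu_core=False,
    model_path='./npoclass_model_bc/',  # hypothetical local path (must end with '/')
    ntee_type='bc',
    backend='sequential',
)
print(example_results[0]['recommended'], example_results[0]['confidence'])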
Beispiel #26
0
        else:
            raise NotImplementedError

    monitor.times = np.array(monitor.times)
    monitor.objs = np.array(monitor.objs)
    monitor.objs_test = 0  # TODO
    monitor.alphas = np.array(monitor.alphas)
    return (dataset_name, method, tol, n_outer, tolerance_decrease,
            monitor.times, monitor.objs, monitor.objs_test,
            monitor.alphas, alpha_max,
            model_name)


print("enter sequential")

with parallel_backend("loky", inner_max_num_threads=1):
    results = Parallel(n_jobs=n_jobs, verbose=100)(
        delayed(parallel_function)(
            dataset_name, method, n_outer=n_outer,
            tolerance_decrease=tolerance_decrease, tol=tol)
        for dataset_name, method, n_outer,
        tolerance_decrease in product(
            dataset_names, methods, n_outers, tolerance_decreases))
    print('OK finished parallel')

df = pd.DataFrame(results)
df.columns = [
    'dataset', 'method', 'tol', 'n_outer', 'tolerance_decrease',
    'times', 'objs', 'objs_test', 'alphas', 'alpha_max', 'model_name']

for dataset_name in dataset_names:
Beispiel #27
0
    predictors = ['TEMP', 'RH', 'PRECIP', 'U', 'V']
    dilon, dilat = 6, 4

    obs_data = xr.open_dataset(args.dataset)
    # Mask out cells where we have no PRECIP data
    mask = np.isnan(obs_data.PRECIP.isel(time=0)).rename("CONUS_MASK")

    do_hybrid = args.case == 'hybrid'
    print("Initializing model...")
    obs_model = Shen2017Model(
        obs_data,
        month=args.month,
        mask=mask,
        # lat_range=(30, 33), lon_range=(-80, -78),
        verbose=True,
        n_predictors=3,
        hybrid=do_hybrid,
        cv=args.cv)

    print("Fitting model...")
    with parallel_backend('dask.distributed', scheduler_host='localhost:8786'):
        obs_model.fit_parallel(-1)

    # Save output
    print("Saving to", args.output)
    obs_model.to_pickle(args.output)

    # Test prediction
    print("Making test prediction")
    obs_model.predict(obs_data).to_netcdf("test.pred.nc")
Beispiel #28
0
    #shutil.copy2('/Users/rogerzhu/Documents/temoa/temoa-va/virginia/data/data_virginia.xlsx', '/Users/rogerzhu/Documents/temoa/temoa-va/virginia/data/data_virginia_'+str(calendar.timegm(time.gmtime()))+'.xlsx');
    for modelInputs_XLSX, scenarioNames in zip(modelInputs_XLSX_list,
                                               scenarioNames_list):

        # =======================================================
        # Move modelInputs_XLSX to database
        # =======================================================
        modelInputs = tt.move_data_to_db(modelInputs_XLSX, path=project_path)

        # =======================================================
        # Create directories - best completed before using multiprocessing
        # =======================================================
        tt.create_dir(project_path=project_path, optional_dir='results')

        # ====================================
        # Perform Simulations
        option = 2  # 1 - Run single, 2 - Run all
        # ====================================

        if option == 1:
            # Perform single simulation
            evaluateModel(modelInputs, scenarioInputs, scenarioNames[0],
                          temoa_path)

        elif option == 2:
            # Perform simulations in parallel
            with parallel_backend('multiprocessing', n_jobs=ncpus):
                Parallel(n_jobs=ncpus, verbose=5)(delayed(evaluateModel)(
                    modelInputs, scenarioInputs, scenarioName, temoa_path,
                    project_path, solver) for scenarioName in scenarioNames)
Beispiel #29
0
def evaluate(model,
             criterion,
             input_tensor,
             target_tensor,
             BOS_token,
             device='cuda',
             EOS_token=2,
             PAD_token=0,
             TARGET_LEN=30,
             beam_size=3,
             beam_search=False):
    model.eval()
    with torch.no_grad():
        batch_size = input_tensor.size(0)
        encoder_hidden = model.encoder.initHidden(batch_size).to(device)

        input_tensor = input_tensor.transpose(0, 1).to(device)
        target_tensor = target_tensor.transpose(0, 1).to(device)

        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)

        #encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
        enc_outputs, encoder_hidden = model(input_tensor,
                                            encoder_hidden,
                                            batch_size,
                                            encoding=True,
                                            enc_outputs=None)

        decoder_input = torch.LongTensor(
            [BOS_token for _ in range(batch_size)]).view(-1, 1).to(device)
        encoder_hidden = encoder_hidden.view(model.encoder.n_layers,
                                             model.encoder.direction,
                                             batch_size,
                                             model.encoder.hidden_size)
        decoder_hidden = torch.cat(
            (encoder_hidden[:, 0, :, :], encoder_hidden[:, 1, :, :]), dim=2)

        def bs(i):
            start_node = BeamNode(decoder_hidden[:, i:i + 1, :].contiguous(),
                                  None, decoder_input[i, :].contiguous(), 0, 1)

            all_nodes = [start_node]
            now_nodes = [start_node]
            end_pq = PriorityQueue()

            for j in range(TARGET_LEN):
                if len(now_nodes) == 0:
                    break

                pq = PriorityQueue()

                for node in now_nodes:
                    input, hidden = node.idx, node.hidden
                    output, hidden = model(input,
                                           hidden,
                                           1,
                                           encoding=False,
                                           enc_outputs=enc_outputs[:,
                                                                   i:i + 1, :])
                    output = F.log_softmax(output, dim=1)
                    topv, topi = output.data.topk(beam_size)
                    for (score, idx) in zip(topv.detach().squeeze(0),
                                            topi.detach().squeeze(0)):
                        nxt_node = BeamNode(hidden, node, idx.unsqueeze(0),
                                            node.score + score,
                                            node.length + 1)
                        pq.put(nxt_node)

                now_nodes = []
                for _ in range(beam_size):
                    assert pq.qsize() > 0
                    node = pq.get()
                    all_nodes.append(node)
                    if node.idx == EOS_token or j == TARGET_LEN - 1:
                        end_pq.put(node)
                    else:
                        now_nodes.append(node)

            assert end_pq.qsize() > 0
            best_node = end_pq.get()

            predict = [best_node.idx.cpu().numpy()[0]]
            while best_node.prev is not None:
                best_node = best_node.prev
                predict.append(best_node.idx.cpu().numpy()[0])
            predict = predict[-2::-1]

            while len(predict) < TARGET_LEN:
                predict.append(PAD_token)

            return (i, np.array(predict))

        if beam_search:
            with parallel_backend('threading', n_jobs=-2):
                decoder_predict = Parallel()(delayed(bs)(i)
                                             for i in range(batch_size))
            decoder_predict = sorted(decoder_predict, key=lambda x: x[0])
            decoder_predict = [x[1] for x in decoder_predict]
            decoder_predict = np.stack(decoder_predict)
            return decoder_predict
        else:
            loss = 0
            decoder_predict = []
            for di in target_tensor:
                decoder_output, decoder_hidden = model(decoder_input,
                                                       decoder_hidden,
                                                       batch_size,
                                                       encoding=False,
                                                       enc_outputs=enc_outputs)
                loss += criterion(decoder_output, di.view(-1))
                topv, topi = decoder_output.data.topk(1)
                decoder_input = topi.detach().to(device)

                decoder_predict.append(topi.cpu().numpy())

            decoder_predict = np.hstack(decoder_predict)
            return loss.item() / target_length, decoder_predict
Beispiel #30
0
def paralelizeJobWhithDaskClient(function, client):
    c = client
    print(c)
    with joblib.parallel_backend('dask'):
        function()
Beispiel #31
0
import numpy as np
from joblib import parallel_backend
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

from ray.util.joblib import register_ray
register_ray()
param_space = {
    'C': np.logspace(-6, 6, 30),
    'gamma': np.logspace(-8, 8, 30),
    'tol': np.logspace(-4, -1, 30),
    'class_weight': [None, 'balanced'],
}

model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=5, n_iter=300, verbose=1)
digits = load_digits()

with parallel_backend('ray'):
    search.fit(digits.data, digits.target)
Beispiel #32
0
    def nuscenes_gt_to_kitti(
        self,
        lyft_dataroot: str = "/home/yw763/driving/lyft/v1.02-train",
        table_folder: str = "/home/yw763/driving/lyft/v1.02-train/v1.02-train",
        lidar_name: str = "LIDAR_TOP",
        get_all_detections: bool = True,
        parallel_n_jobs: int = 16,
        samples_count: Optional[int] = None,
    ) -> None:
        """Converts nuScenes GT formatted annotations to KITTI format.

        Args:
            lyft_dataroot: root folder of the dataset.
            table_folder: folder with tables (json files).
            lidar_name: Name of the lidar sensor.
                Only one lidar allowed at this moment.
            get_all_detections: If True, will write all
                bboxes in PointCloud and use only FrontCamera.
            parallel_n_jobs: Number of threads for parallel processing.
            samples_count: Number of samples to convert.

        """
        self.lyft_dataroot = lyft_dataroot
        self.table_folder = table_folder
        self.lidar_name = lidar_name
        self.get_all_detections = get_all_detections
        self.samples_count = samples_count
        self.parallel_n_jobs = parallel_n_jobs

        # Select subset of the data to look at.
        self.lyft_ds = LyftDataset(self.lyft_dataroot, self.table_folder)

        self.kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi)
        self.kitti_to_nu_lidar_inv = self.kitti_to_nu_lidar.inverse

        # Get assignment of scenes to splits.
        split_logs = [
            self.lyft_ds.get("log", scene["log_token"])["logfile"]
            for scene in self.lyft_ds.scene
        ]
        if self.get_all_detections:
            self.cams_to_see = ["CAM_FRONT"]
        else:
            self.cams_to_see = [
                "CAM_FRONT",
                "CAM_FRONT_LEFT",
                "CAM_FRONT_RIGHT",
                "CAM_BACK",
                "CAM_BACK_LEFT",
                "CAM_BACK_RIGHT",
            ]

        # Create output folders.
        self.label_folder = self.store_dir.joinpath("label_2")
        self.calib_folder = self.store_dir.joinpath("calib")
        self.image_folder = self.store_dir.joinpath("image_2")
        self.lidar_folder = self.store_dir.joinpath("velodyne")
        for folder in [
                self.label_folder, self.calib_folder, self.image_folder,
                self.lidar_folder
        ]:
            if not folder.is_dir():
                folder.mkdir(parents=True)

        # Use only the samples from the current split.
        sample_tokens = self._split_to_samples(split_logs)
        if self.samples_count is not None:
            sample_tokens = sample_tokens[:self.samples_count]

        # print(len(sample_tokens))
        sample_tokens = sample_tokens

        self.tokens = sample_tokens

        with parallel_backend("threading", n_jobs=self.parallel_n_jobs):
            Parallel()(delayed(self.process_token_to_kitti)(sample_token)
                       for sample_token in tqdm(sample_tokens))
Beispiel #33
0
def function_multiprocessing(
    function: Callable,
    kwargs_list: List[Dict[str, Any]],
    recombined_epc: Union[Path, str],
    cluster,
    consolidate: bool = True,
) -> List[bool]:
    """Calls a function concurrently with the specfied arguments.

    A multiprocessing pool is used to call the function multiple times in parallel. Once
    all results are returned, they are combined into a single epc file.

    Args:
        function (Callable): the function to be called. Needs to return:

            - index (int): the index of the kwargs in the kwargs_list.
            - success (bool): whether the function call was successful, whatever that
                definition is.
            - epc_file (Path/str): the epc file path where the objects are stored.
            - uuid_list (List[str]): list of UUIDs of relevant objects.

        kwargs_list (List[Dict[Any]]): A list of keyword argument dictionaries that are
            used when calling the function.
        recombined_epc (Path/str): A pathlib Path or path string of
            where the combined epc will be saved.
        cluster (LocalCluster/JobQueueCluster): a LocalCluster is a Dask cluster on a
            local machine. If using a job queuing system, a JobQueueCluster can be used
            such as an SGECluster, SLURMCluster, PBSCluster, LSFCluster etc.
        consolidate (bool): if True and an equivalent part already exists in
            a model, it is not duplicated and the uuids are noted as equivalent.

    Returns:
        success_list (List[bool]): A boolean list of successful function calls.

    Note:
        This function uses the Dask backend to run the given function in parallel, so a
        Dask cluster must be setup and passed as an argument. Dask will need to be
        installed in the Python environment because it is not a dependency of the
        project. More info can be found at https://docs.dask.org/en/latest/deploying.html
    """
    log.info("Multiprocessing function called with %s function.", function.__name__)

    for i, kwargs in enumerate(kwargs_list):
        kwargs["index"] = i

    with parallel_backend("dask"):
        results = Parallel()(delayed(function)(**kwargs) for kwargs in kwargs_list)

    log.info("Function calls complete.")

    # Sorting the results by the original kwargs_list index.
    results = list(sorted(results, key = lambda x: x[0]))

    success_list = [result[1] for result in results]
    epc_list = [result[2] for result in results]
    uuids_list = [result[3] for result in results]
    log.info("Number of successes: %s/%s.", sum(success_list), len(results))

    epc_file = Path(str(recombined_epc))
    if epc_file.is_file():
        model_recombined = Model(epc_file = str(epc_file))
    else:
        model_recombined = new_model(epc_file = str(epc_file))

    log.info("Creating the recombined epc file.")
    for i, epc in enumerate(epc_list):
        if epc is None:
            continue
        while True:
            try:
                model = Model(epc_file = epc)
                break
            except FileNotFoundError:
                time.sleep(1)
                continue
        uuids = uuids_list[i]
        if uuids is None:
            uuids = model.uuids()
        for uuid in uuids:
            model_recombined.copy_uuid_from_other_model(model, uuid = uuid, consolidate = consolidate)

    # Deleting temporary directory.
    log.info("Deleting the temporary directory")
    rm_tree("tmp_dir")

    model_recombined.store_epc()

    log.info("Recombined epc file complete.")

    return success_list
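# Hedged usage sketch (assumptions, not part of the example above): the worker name,
# kwargs and epc path are hypothetical; what matters is the return protocol
# (index, success, epc_file, uuid_list) described in the docstring, and that a
# dask.distributed Client is attached before calling, since the function relies on
# joblib's "dask" backend.
from dask.distributed import Client, LocalCluster

def noop_worker(index, tag):
    # A real worker would build its objects, write them to a per-call epc file and
    # return that path plus the relevant uuids; returning None for the epc file makes
    # the recombination loop above simply skip this result.
    return index, True, None, None

cluster = LocalCluster(n_workers=2)
client = Client(cluster)  # attaches a client so parallel_backend("dask") can find the cluster
kwargs_list = [{"tag": t} for t in ("a", "b", "c")]
success_flags = function_multiprocessing(
    noop_worker, kwargs_list, "recombined.epc", cluster)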
Beispiel #34
0
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        classifier_obj = sklearn.svm.SVC(C=svc_c, gamma="auto")
    else:
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        classifier_obj = sklearn.ensemble.RandomForestClassifier(
            max_depth=rf_max_depth, n_estimators=10
        )

    score = sklearn.model_selection.cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    with joblib.parallel_backend("ray", n_jobs=-1):
        study.optimize(objective, n_trials=100)

    print(f"Number of finished trials: {len(study.trials)}")

    print(f"Elapsed time: {study.trials[-1].datetime_complete - study.trials[0].datetime_start}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
Beispiel #35
0
# If you are on a UNIX system, it is possible to fallback to the old
# ``multiprocessing`` backend, which can pickle interactively defined functions
# with the default pickle module, which is faster for such large objects.
#

if sys.platform != 'win32':
    if IS_RUN_WITH_SPHINX_GALLERY:
        # When this example is run with sphinx gallery, it breaks the pickling
        # capacity for multiprocessing backend so we have to modify the way we
        # define our functions. This has nothing to do with the example.
        from utils import func_async
    else:
        def func_async(i, *args):
            return 2 * i

    with parallel_backend('multiprocessing'):
        t_start = time.time()
        Parallel(n_jobs=2)(
            delayed(func_async)(21, large_list) for _ in range(1))
        print("With multiprocessing backend and pickle serialization: {:.3f}s"
              .format(time.time() - t_start))


###############################################################################
# However, using ``fork`` to start new processes can cause violation of the
# POSIX specification and can have bad interaction with compiled extensions
# that use ``openmp``. Also, it is not possible to start processes with
# ``fork`` on windows where only ``spawn`` is available. The ``loky`` backend
# has been developed to mitigate these issues.
#
# To have fast pickling with ``loky``, it is possible to rely on ``pickle`` to