def computing_precise_vs_imprecise(in_path=None, ell_optimal=0.1, cv_n_fold=10, seeds=None,
                                   lib_path_server=None, model_type_precise='lda',
                                   model_type_imprecise='ilda', scaling=True):
    """Compare a precise and an imprecise classifier on ambiguous test instances.

    Runs ``cv_n_fold`` repetitions of a shuffled ``cv_n_fold``-fold
    cross-validation. Within each fold only the test instances for which the
    imprecise model returns more than one label are counted; for those the
    function tallies how often the true label belongs to the imprecise
    prediction and how often it belongs to the precise prediction. Ratios are
    averaged per fold, then per repetition, then over repetitions. The result
    is only logged; nothing is returned.

    :param in_path: CSV file read with ``pd.read_csv``; when ``None`` the data
        comes from ``export_data_set('iris.data')``. The last column is used as
        the label, all others as features.
        NOTE(review): the CSV is read without ``header=None`` here, unlike
        several sibling functions in this file - confirm this is intended.
    :param ell_optimal: value forwarded as ``ell`` to the imprecise model's
        ``learn``.
    :param cv_n_fold: number of folds AND number of repetitions.
    :param seeds: one ``random_state`` per repetition; generated with
        ``generate_seeds(cv_n_fold)`` when ``None``.
    :param lib_path_server: forwarded as ``add_path_matlab`` to the imprecise
        model factory.
    :param model_type_precise: key passed to ``__factory_model_precise``.
    :param model_type_imprecise: key passed to ``__factory_model``.
    :param scaling: when true, features go through ``normalize_minmax``.
    """
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    logger = create_logger("computing_precise_vs_imprecise", True)
    logger.info('Training dataset and models (%s, %s, %s, %s)', in_path,
                model_type_precise, model_type_imprecise, ell_optimal)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    model_impr = __factory_model(model_type_imprecise, init_matlab=True,
                                 add_path_matlab=lib_path_server, DEBUG=False)
    model_prec = __factory_model_precise(model_type_precise, store_covariance=True)
    avg_imprecise, avg_precise, n_real_times = 0, 0, 0
    # The loop variable is deliberately not called `time`, so it cannot shadow
    # the `time` module inside this function.
    for repetition in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[repetition], shuffle=True)
        imprecise_mean, precise_mean, n_real_fold = 0, 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model_impr.learn(X=X_cv_train, y=y_cv_train, ell=ell_optimal)
            model_prec.fit(X_cv_train, y_cv_train)
            n_real_tests, time_precise, time_imprecise = 0, 0, 0
            for i, test in enumerate(X_cv_test):
                evaluate_imp, _ = model_impr.evaluate(test)
                evaluate = model_prec.predict([test])
                # Only instances with a non-singleton imprecise prediction
                # take part in the comparison.
                if len(evaluate_imp) > 1:
                    n_real_tests += 1
                    if y_cv_test[i] in evaluate_imp:
                        time_imprecise += 1
                    if y_cv_test[i] in evaluate:
                        time_precise += 1
                logger.debug(
                    "(time, iTest, ellOptimal, cautious, prediction, ground-truth)(%s, %s, %s, %s, %s, %s)",
                    repetition, i, ell_optimal, evaluate_imp, evaluate, y_cv_test[i])
            logger.debug(
                "(time, ellOptimal, nRealTests, timeImprecise, timePrecise) (%s, %s, %s, %s, %s)",
                repetition, ell_optimal, n_real_tests, time_imprecise, time_precise)
            # Folds without any ambiguous instance are skipped in the average.
            if n_real_tests > 0:
                n_real_fold += 1
                imprecise_mean += time_imprecise / n_real_tests
                precise_mean += time_precise / n_real_tests
        logger.debug("(time, nRealFold, imprecise, precise) (%s, %s, %s, %s)",
                     repetition, n_real_fold, imprecise_mean, precise_mean)
        if n_real_fold > 0:
            n_real_times += 1
            avg_imprecise += imprecise_mean / n_real_fold
            avg_precise += precise_mean / n_real_fold
    # Guard the last averaging level like the two inner ones: with no ambiguous
    # instance in any repetition there is nothing to average, so report NaN
    # instead of raising ZeroDivisionError.
    if n_real_times > 0:
        final_imprecise = avg_imprecise / n_real_times
        final_precise = avg_precise / n_real_times
    else:
        final_imprecise = final_precise = float('nan')
    logger.debug("(dataset, models, imprec, prec) (%s, %s, %s, %s, %s)", in_path,
                 model_type_imprecise, model_type_precise, final_imprecise, final_precise)
def performance_hold_out(in_path=None, out_path=None, model_type='lda', test_pct=0.4,
                         n_times=10, seeds=None, scaling=False):
    """Evaluate a precise classifier with repeated hold-out splits.

    For each of ``n_times`` rounds the data is split with ``train_test_split``,
    the model is fitted on the training part, and every test instance is
    predicted one at a time. Whenever the true label is in the prediction, the
    values of ``u65(prediction)`` and ``u80(prediction)`` are added up; the sums
    are divided by the test-set size, logged, and written as one CSV row
    ``[-999, round, u65, u80]``. The mean over all rounds is logged at the end.

    :param in_path: existing header-less CSV; last column is the label.
    :param out_path: existing file, opened in ``'w'`` mode (so it is truncated).
    :param model_type: key passed to ``__factory_model_precise``.
    :param test_pct: ``test_size`` forwarded to ``train_test_split``.
    :param n_times: number of hold-out rounds.
    :param seeds: one ``random_state`` per round; generated with
        ``generate_seeds(n_times)`` when ``None``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"
    logger = create_logger("performance_hold_out", True)
    logger.info('Training data set %s, test percentage %s, model_type %s',
                in_path, test_pct, model_type)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = data.iloc[:, -1].tolist()
    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)
    model = __factory_model_precise(model_type, store_covariance=True)
    mean_u65, mean_u80 = np.array([]), np.array([])
    # The context manager guarantees the handle is closed even when a round
    # raises (e.g. a fit failure or too few seeds).
    with open(out_path, 'w') as file_csv:
        writer = csv.writer(file_csv)
        for i in range(0, n_times):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_pct, random_state=seeds[i])
            sum_u65, sum_u80 = 0, 0
            model.fit(X_train, y_train)
            n, _ = X_test.shape
            for j, test in enumerate(X_test):
                evaluate = model.predict([test])
                if y_test[j] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            score_u65, score_u80 = sum_u65 / n, sum_u80 / n
            logger.info("time, u65, u80 (%s, %s, %s)", i, score_u65, score_u80)
            mean_u65 = np.append(mean_u65, score_u65)
            mean_u80 = np.append(mean_u80, score_u80)
            writer.writerow([-999, i, mean_u65[i], mean_u80[i]])
            # Flush per round so partial results survive a killed process.
            file_csv.flush()
    logger.info("[total:data-set:avgResults] (%s, %s)", np.mean(mean_u65), np.mean(mean_u80))
def performance_accuracy_hold_out(in_path=None, model_type="ilda", ell_optimal=0.1,
                                  lib_path_server=None, seeds=None, DEBUG=False,
                                  scaling=False, n_times=10):
    """Evaluate an imprecise classifier with repeated 60/40 hold-out splits.

    One split is drawn per seed (``test_size=0.4``). The model is trained on
    the training part with ``ell=ell_optimal`` and each test instance is
    evaluated individually; when the true label is in the returned prediction
    the values ``u65(prediction)`` / ``u80(prediction)`` are accumulated. The
    per-split scores (sum divided by test-set size) are averaged over all
    splits and logged. Nothing is returned.

    :param in_path: existing data file, loaded through ``dataset_to_Xy``.
    :param model_type: key passed to ``__factory_model``.
    :param ell_optimal: value forwarded as ``ell`` to ``model.learn``.
    :param lib_path_server: forwarded as ``add_path_matlab`` to the factory.
    :param seeds: one ``random_state`` per split; its length sets the number
        of splits. Generated with ``generate_seeds(n_times)`` when ``None``.
    :param DEBUG: forwarded to the model factory.
    :param scaling: forwarded to ``dataset_to_Xy``.
    :param n_times: number of seeds to generate when ``seeds`` is ``None``
        (the original code referenced an undefined ``cv_n_fold`` here).
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(
        in_path
    ), "Without training data, cannot performing cross hold-out accuracy"
    logger = create_logger("performance_accuracy_hold_out", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)
    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    n_time = len(seeds)
    mean_u65, mean_u80 = 0, 0
    model = __factory_model(model_type, solver_matlab=True,
                            add_path_matlab=lib_path_server, DEBUG=DEBUG)
    for k in range(0, n_time):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seeds[k])
        # Train on this split's training part (previously referenced the
        # undefined names X_cv_train / y_cv_train).
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            # Previously called an undefined `lqa` object.
            # NOTE(review): another caller in this file unpacks
            # `model.evaluate(...)` as a 2-tuple; confirm that the value
            # returned here is the collection of predicted labels.
            evaluate = model.evaluate(test)
            logger.debug(
                "(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                i, ell_optimal, evaluate, y_test[i])
            if y_test[i] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        # Previously logged the undefined name `ell_current`.
        logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, k,
                     sum_u65 / n_test, sum_u80 / n_test)
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / n_time
    mean_u80 = mean_u80 / n_time
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal, mean_u65, mean_u80)
def performance_cv_accuracy(in_path=None, model_type='lda', cv_n_fold=10, seeds=None, scaling=False):
    """Score a precise classifier with repeated shuffled k-fold cross-validation.

    ``cv_n_fold`` repetitions are run, each with its own seed and a freshly
    built model. Inside every fold each test instance is predicted on its own;
    when the true label is in the prediction, ``u65``/``u80`` of that
    prediction are summed. Fold sums are divided by the fold's test size,
    added up per repetition and divided by ``cv_n_fold``; the repetition
    scores are averaged the same way and logged. Nothing is returned.

    :param in_path: existing header-less CSV; last column is the label.
    :param model_type: key passed to ``__factory_model_precise``.
    :param cv_n_fold: number of folds AND number of repetitions.
    :param seeds: one ``random_state`` per repetition; generated with
        ``generate_seeds(cv_n_fold)`` when ``None``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_cv_accuracy", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s',
                in_path, cv_n_fold, model_type)
    features = data.iloc[:, :-1].values
    if scaling:
        features = normalize_minmax(features)
    labels = np.array(data.iloc[:, -1].tolist())
    if seeds is None:
        seeds = generate_seeds(cv_n_fold)
    logger.info('Seeds generated %s', seeds)

    total_u65 = total_u80 = 0
    for repetition in range(cv_n_fold):
        # A new random k-fold partition and a new model for every repetition.
        folds = KFold(n_splits=cv_n_fold, random_state=seeds[repetition], shuffle=True)
        classifier = __factory_model_precise(model_type, store_covariance=True)
        rep_u65 = rep_u80 = 0
        for train_rows, test_rows in folds.split(labels):
            classifier.fit(features[train_rows], labels[train_rows])
            fold_size = len(test_rows)
            fold_u65 = fold_u80 = 0
            held_out = zip(features[test_rows], labels[test_rows])
            for position, (sample, truth) in enumerate(held_out):
                predicted = classifier.predict([sample])
                logger.debug(
                    "(testing, prediction, ground-truth) (%s, %s, %s)",
                    position, predicted, truth)
                if truth in predicted:
                    fold_u65 += u65(predicted)
                    fold_u80 += u80(predicted)
            rep_u65 += fold_u65 / fold_size
            rep_u80 += fold_u80 / fold_size
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                    seeds[repetition], rep_u65 / cv_n_fold, rep_u80 / cv_n_fold)
        total_u65 += rep_u65 / cv_n_fold
        total_u80 += rep_u80 / cv_n_fold
    logger.info("[Total:data-set:avgResults] (%s, %s, %s)", in_path,
                total_u65 / cv_n_fold, total_u80 / cv_n_fold)
def performance_cv_accuracy_imprecise(in_path=None, model_type="ilda", ell_optimal=0.1,
                                      nb_process=2, lib_path_server=None, cv_n_fold=10,
                                      seeds=None, criterion="maximality"):
    """Score an imprecise classifier with repeated k-fold CV using worker processes.

    NOTE: a later definition with the same name in this file rebinds the name,
    so this variant (the one that takes ``criterion``) is not reachable by
    name once the whole module has been executed.

    A ``ManagerWorkers`` pool is started, then ``cv_n_fold`` repetitions of a
    shuffled ``cv_n_fold``-fold split are run. Each fold is delegated to
    ``computing_training_testing_step``, which receives and returns the running
    u65/u80 accumulators. Per-repetition and overall averages are logged.
    Nothing is returned.

    :param in_path: existing CSV; last column is the label.
        NOTE(review): read without ``header=None`` unlike sibling functions -
        confirm the file really has a header row.
    :param model_type: forwarded to ``manager.executeAsync``.
    :param ell_optimal: forwarded to ``computing_training_testing_step``.
    :param nb_process: forwarded to ``ManagerWorkers``.
    :param lib_path_server: forwarded to ``manager.executeAsync``.
    :param cv_n_fold: number of folds AND number of repetitions.
    :param seeds: one ``random_state`` per repetition; generated with
        ``generate_seeds(cv_n_fold)`` when ``None``.
    :param criterion: forwarded to ``ManagerWorkers``.
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path)
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s, %s)', in_path, model_type,
                ell_optimal, criterion)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    for repetition in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[repetition], shuffle=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            logger.info("Splits train %s", idx_train)
            logger.info("Splits test %s", idx_test)
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            step_scores = computing_training_testing_step(
                X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_optimal,
                manager, mean_u65, mean_u80)
            # NOTE(review): other callers in this file unpack three values from
            # this helper. Taking the first two positionally works whether it
            # returns (u65, u80) or (u65, u80, extra).
            mean_u65, mean_u80 = step_scores[0], step_scores[1]
            logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal,
                         repetition, mean_u65, mean_u80)
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                    seeds[repetition], mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    manager.poisonPillTraining()
    logger.debug("total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
def performance_cv_accuracy_imprecise(in_path=None, model_type="ilda", ell_optimal=0.1, scaling=False, lib_path_server=None, cv_n_fold=10, seeds=None, nb_process=10):
    """Score an imprecise classifier with repeated k-fold CV using worker processes.

    Data is loaded through ``dataset_to_Xy``; a ``ManagerWorkers`` pool is
    started and ``cv_n_fold`` repetitions of a shuffled ``cv_n_fold``-fold
    split are run. Each fold is handed to ``computing_training_testing_step``
    together with the running u65/u80 accumulators, which it returns updated
    (a third returned value is ignored). Per-repetition and overall averages
    are logged. Nothing is returned.

    :param in_path: existing data file.
    :param model_type: forwarded to ``executeAsync`` of the worker manager.
    :param ell_optimal: forwarded to ``computing_training_testing_step``.
    :param scaling: forwarded to ``dataset_to_Xy``.
    :param lib_path_server: forwarded to ``executeAsync``.
    :param cv_n_fold: number of folds AND number of repetitions.
    :param seeds: one ``random_state`` per repetition; generated with
        ``generate_seeds(cv_n_fold)`` when ``None``.
    :param nb_process: forwarded to ``ManagerWorkers``.
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(
        in_path
    ), "Without training data, cannot performing cross validation accuracy"
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, ell_optimal)
    features, labels = dataset_to_Xy(in_path, scaling=scaling)
    if seeds is None:
        seeds = generate_seeds(cv_n_fold)
    logger.info('Seeds used for accuracy %s', seeds)

    workers = ManagerWorkers(nb_process=nb_process)
    workers.executeAsync(model_type, lib_path_server)

    total_u65 = total_u80 = 0
    for repetition in range(cv_n_fold):
        folds = KFold(n_splits=cv_n_fold, random_state=seeds[repetition], shuffle=True)
        run_u65 = run_u80 = 0
        for train_rows, test_rows in folds.split(labels):
            # The helper takes the running sums and hands them back updated.
            run_u65, run_u80, _ = computing_training_testing_step(
                features[train_rows], labels[train_rows],
                features[test_rows], labels[test_rows],
                ell_optimal, workers, run_u65, run_u80)
            logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal,
                         repetition, run_u65, run_u80)
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                    seeds[repetition], run_u65 / cv_n_fold, run_u80 / cv_n_fold)
        total_u65 += run_u65 / cv_n_fold
        total_u80 += run_u80 / cv_n_fold

    workers.poisonPillTraining()
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 total_u65 / cv_n_fold, total_u80 / cv_n_fold)
def computing_time_prediction(in_path=None, ell_optimal=0.1, lib_path_server=None,
                              model_type="ilda", criterion="maximality",
                              k_repetition=10, seeds=None):
    """Measure the average per-instance prediction time of an imprecise model.

    For each of ``k_repetition`` random 80/20 splits the model is trained with
    ``ell=ell_optimal`` and every test instance is evaluated individually
    while the elapsed time of the ``evaluate`` call is measured. The mean time
    per instance of every repetition is collected; the collection, its mean
    and its standard deviation are logged. Nothing is returned.

    :param in_path: existing header-less CSV; last column is the label.
    :param ell_optimal: value forwarded as ``ell`` to ``model.learn``.
    :param lib_path_server: forwarded as ``add_path_matlab`` to the factory.
    :param model_type: key passed to ``__factory_model``.
    :param criterion: forwarded to ``model.evaluate``.
    :param k_repetition: number of random splits.
    :param seeds: one ``random_state`` per repetition; generated with
        ``generate_seeds(k_repetition)`` when ``None``.
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("computing_time_prediction", True)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    seeds = generate_seeds(k_repetition) if seeds is None else seeds
    logger.info(
        'Training dataset %s with maximality version (%s) and model (%s), ell_optimal (%s) and seeds %s',
        in_path, criterion, model_type, ell_optimal, seeds)
    model = __factory_model(model_type, solver_matlab=True,
                            add_path_matlab=lib_path_server, DEBUG=False)
    per_repetition = []
    # Bound up front so the final log line is valid even when k_repetition == 0.
    n = 0
    for k in range(k_repetition):
        logger.info("%s-fold repetition randomly, seed %s", k, seeds[k])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seeds[k])
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        n, _ = X_test.shape
        sum_time = 0
        for i, test in enumerate(X_test):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted, corrupting short timings.
            start = time.perf_counter()
            evaluate = model.evaluate(test, criterion=criterion)
            elapsed = time.perf_counter() - start
            logger.info("Evaluate %s, Ground-truth %s, Time %s ", evaluate,
                        y_test[i], elapsed)
            sum_time += elapsed
        per_repetition.append(sum_time / n)
    # Converted once at the end instead of re-allocating with np.append on
    # every repetition; the logged representation stays a NumPy array.
    avg = np.array(per_repetition)
    logger.info("Total time (%s, %s) and average %s and sd %s of %s testing",
                in_path, avg, np.mean(avg), np.std(avg), n)
def performance_qda_regularized(in_path=None, out_path=None, cv_n_fold=10, seeds=None,
                                from_alpha=0, to_alpha=2.0, by_alpha=0.01, scaling=False):
    """Select the QDA ``reg_param`` by nested cross-validation and score it.

    A grid ``np.arange(from_alpha, to_alpha, by_alpha)`` of regularisation
    values is built with one "qda" model per value. An outer shuffled
    ``cv_n_fold``-fold split (unseeded) separates learning and testing parts.
    On each learning part an inner seeded ``cv_n_fold``-fold split accumulates
    a u80-based score for every grid value; the best value (``np.argmax``) is
    refitted on the whole learning part and scored on the outer testing part.
    The grid and the per-fold score vectors are appended to ``out_path`` as
    CSV rows; a summary is logged. Nothing is returned.

    :param in_path: existing header-less CSV; last column is the label.
    :param out_path: existing file, opened in append mode.
    :param cv_n_fold: number of folds for both the outer and inner split.
    :param seeds: one ``random_state`` per outer fold for the inner split;
        generated with ``generate_seeds(cv_n_fold)`` when ``None``.
    :param from_alpha: start of the grid (inclusive).
    :param to_alpha: end of the grid (exclusive, as with ``np.arange``).
    :param by_alpha: grid step.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_qda_regularized", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path, cv_n_fold, "qda")
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    alphas = np.arange(from_alpha, to_alpha, by_alpha)
    qda_regularized = [
        __factory_model_precise("qda", store_covariance=True, reg_param=alpha)
        for alpha in alphas
    ]
    accuracy, best_alphas = [0] * cv_n_fold, [0] * cv_n_fold

    # The context manager closes the handle even if a fold raises.
    with open(out_path, 'a') as file_csv:
        writer = csv.writer(file_csv)
        writer.writerow(alphas)
        # Outer random k-fold: learning part for model selection, testing part
        # for the final score of the selected model.
        kf_second = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
        for ikfold, (idx_learning, idx_testing) in enumerate(kf_second.split(y)):
            X_training, y_training = X[idx_learning], y[idx_learning]
            X_testing, y_testing = X[idx_testing], y[idx_testing]
            kf = KFold(n_splits=cv_n_fold, random_state=seeds[ikfold], shuffle=True)
            acc_u80 = [0] * len(qda_regularized)
            for idx_train, idx_test in kf.split(y_training):
                X_cv_train, y_cv_train = X_training[idx_train], y_training[idx_train]
                X_cv_test, y_cv_test = X_training[idx_test], y_training[idx_test]
                for model in qda_regularized:
                    model.fit(X_cv_train, y_cv_train)
                n_test = len(idx_test)
                for i, test in enumerate(X_cv_test):
                    for im, model in enumerate(qda_regularized):
                        evaluate = model.predict([test])
                        if y_cv_test[i] in evaluate:
                            # Each hit contributes its share of the mean over
                            # test instances and over inner folds.
                            acc_u80[im] += (u80(evaluate) / n_test) / cv_n_fold
            idx_best = np.argmax(acc_u80)
            logger.info("[1kfold:best_model:seed:u80] (%s, %s, %s, %s)", ikfold,
                        alphas[idx_best], seeds[ikfold], acc_u80)
            writer.writerow(acc_u80)
            file_csv.flush()

            best_model = __factory_model_precise("qda", store_covariance=True,
                                                 reg_param=alphas[idx_best])
            best_model.fit(X_training, y_training)
            accuracy[ikfold], bn_test, best_alphas[ikfold] = 0, len(idx_testing), alphas[idx_best]
            for i, test in enumerate(X_testing):
                evaluate = best_model.predict([test])
                if y_testing[i] in evaluate:
                    accuracy[ikfold] += u80(evaluate) / bn_test
            logger.info("[2kfold:best_model:seed:accuracy] (%s, %s, %s)", ikfold,
                        alphas[idx_best], accuracy[ikfold])
    logger.info("[total:data-set:avgResults] (%s, %s, %s, %s)", in_path,
                np.mean(accuracy), best_alphas, accuracy)
def computing_best_imprecise_mean(in_path=None, out_path=None, lib_path_server=None,
                                  model_type="ilda", from_ell=0.1, to_ell=1.0, by_ell=0.1,
                                  seed=None, cv_kfold_first=10, nb_process=2, skip_nfold=0,
                                  cv_kfold_second=10, seed_second=None, scaling=False):
    """Tune ``ell`` by nested cross-validation and score the best value(s).

    NOTE: a later definition with the same name in this file rebinds the name,
    so this nested-CV variant is not reachable by name once the whole module
    has been executed.

    Outer loop: a seeded shuffled ``cv_kfold_first``-fold split into learning
    and testing parts. Inner loop (per outer fold, unless skipped): a seeded
    ``cv_kfold_second``-fold split of the learning part, on which every
    ``ell`` in ``np.arange(from_ell, to_ell, by_ell)`` is scored through
    ``computing_training_testing_step`` (u65 and u80 accumulators). The
    ``ell`` values with maximal u80 (resp. u65) are then trained on the whole
    learning part and scored on the testing part. Intermediate and final
    scores are appended to ``out_path`` as CSV rows and logged. Nothing is
    returned.

    CSV rows written: ``[ell, fold, u65, u80]`` per grid value,
    ``[-999, -8, ell, u80]`` / ``[-999, -7, ell, u65]`` per optimal value,
    ``[-999, fold, u65, u80]`` per outer fold, and a final
    ``[-9999, -9, mean_u65, mean_u80]``.

    :param in_path: existing header-less CSV; last column is the label.
    :param out_path: existing file, opened in append mode.
    :param lib_path_server: forwarded to ``manager.executeAsync``.
    :param model_type: forwarded to ``manager.executeAsync``.
    :param from_ell: grid start; reset to 0.01 for every outer fold after
        ``skip_nfold`` when ``skip_nfold != 0``.
    :param to_ell: grid end (exclusive, as with ``np.arange``).
    :param by_ell: grid step.
    :param seed: ``random_state`` of the outer split; drawn with
        ``random.randrange(2**30)`` when ``None``.
    :param cv_kfold_first: number of outer folds.
    :param nb_process: forwarded to ``ManagerWorkers``.
    :param skip_nfold: outer folds with a smaller index are skipped.
    :param cv_kfold_second: number of inner folds.
    :param seed_second: one ``random_state`` per OUTER fold for the inner
        split; generated when ``None``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, out_path, model_type)
    logger.info('Parameters (ells, nbProcess, skip_nfold, cv_kfold_second) (%s, %s, %s, %s, %s, %s)',
                from_ell, to_ell, by_ell, nb_process, skip_nfold, cv_kfold_second)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    # Seed of the outer k-fold (learning/testing) split.
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("[FIRST-STEP-SEED] MODEL: %s, SEED: %s", model_type, seed)

    # The context manager closes the results file even if a fold raises.
    with open(out_path, 'a') as file_csv:
        writer = csv.writer(file_csv)
        manager = ManagerWorkers(nb_process=nb_process)
        manager.executeAsync(model_type, lib_path_server)
        kfFirst = KFold(n_splits=cv_kfold_first, random_state=seed, shuffle=True)
        acc_u80, acc_u65 = dict(), dict()
        # One second-step seed is consumed per OUTER fold (indexed by
        # idx_kfold below), so cv_kfold_first seeds are needed; the original
        # generated cv_kfold_second of them.
        # NOTE(review): assumes generate_seeds(n) yields n seeds - confirm.
        seed_2step = generate_seeds(cv_kfold_first) if seed_second is None else seed_second
        logger.debug("[SECOND-STEP-SEEDS] MODEL: %s, SEED: %s, SECOND-SEED: %s",
                     model_type, seed, seed_2step)
        for idx_kfold, (idx_learning, idx_testing) in enumerate(kfFirst.split(y)):
            ell_u65, ell_u80 = dict(), dict()
            # Learning part is used to tune ell, testing part to score it.
            X_learning, y_learning = X[idx_learning], y[idx_learning]
            X_testing, y_testing = X[idx_testing], y[idx_testing]
            logger.info("Splits %s learning %s", idx_kfold, idx_learning)
            logger.info("Splits %s testing %s", idx_kfold, idx_testing)

            # After the first processed fold, restart the grid from 0.01.
            if skip_nfold != 0 and idx_kfold > skip_nfold:
                from_ell = 0.01

            # Skipping leading folds lets several runs share the work.
            if idx_kfold < skip_nfold:
                continue

            # Same inner splits for every ell so their scores are comparable.
            splits_ell = []
            logger.debug("[2-STEP-SEED] MODEL: %s, SEED: %s OF FIRST STEP %s",
                         model_type, seed_2step[idx_kfold], seed)
            kfSecond = KFold(n_splits=cv_kfold_second, random_state=seed_2step[idx_kfold],
                             shuffle=True)
            for idx_learn_train, idx_learn_test in kfSecond.split(y_learning):
                splits_ell.append((idx_learn_train, idx_learn_test))
                logger.info("Splits %s train %s", len(splits_ell), idx_learn_train)
                logger.info("Splits %s test %s", len(splits_ell), idx_learn_test)

            for ell_current in np.arange(from_ell, to_ell, by_ell):
                ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                logger.info("ELL_CURRENT %s", ell_current)
                for idx_learn_train, idx_learn_test in splits_ell:
                    logger.info("Splits step train %s", idx_learn_train)
                    logger.info("Splits step test %s", idx_learn_test)
                    X_cv_train, y_cv_train = X_learning[idx_learn_train], y_learning[idx_learn_train]
                    X_cv_test, y_cv_test = X_learning[idx_learn_test], y_learning[idx_learn_test]
                    ell_u65[ell_current], ell_u80[ell_current], _ = \
                        computing_training_testing_step(X_cv_train, y_cv_train, X_cv_test,
                                                        y_cv_test, ell_current, manager,
                                                        ell_u65[ell_current], ell_u80[ell_current])
                    logger.info("Partial-kfold (%s, %s, %s)", ell_current,
                                ell_u65[ell_current], ell_u80[ell_current])
                # The sums run over the inner splits, so the mean divides by
                # the inner fold count (the original used cv_kfold_first).
                ell_u65[ell_current] = ell_u65[ell_current] / cv_kfold_second
                ell_u80[ell_current] = ell_u80[ell_current] / cv_kfold_second
                writer.writerow([ell_current, idx_kfold, ell_u65[ell_current], ell_u80[ell_current]])
                file_csv.flush()
                logger.debug("Partial-ell-k-step (%s, %s, %s)", idx_kfold,
                             ell_u65[ell_current], ell_u80[ell_current])
            logger.debug("Total-ell-k-step (%s, %s, %s, %s)", in_path, idx_kfold, ell_u65, ell_u80)

            # All ell values tied at the best score are evaluated and averaged.
            acc_ell_u80 = max(ell_u80.values())
            acc_ell_u65 = max(ell_u65.values())
            ell_u80_opts = [k for k, v in ell_u80.items() if v == acc_ell_u80]
            ell_u65_opts = [k for k, v in ell_u65.items() if v == acc_ell_u65]
            acc_u65[idx_kfold], acc_u80[idx_kfold] = 0, 0
            n_ell80_opts, n_ell65_opts = len(ell_u80_opts), len(ell_u65_opts)
            for ell_u80_opt in ell_u80_opts:
                logger.info("ELL_OPTIMAL_CV_U80 %s", ell_u80_opt)
                _, _acc_u80, _ = \
                    computing_training_testing_step(X_learning, y_learning, X_testing,
                                                    y_testing, ell_u80_opt, manager, 0, 0)
                acc_u80[idx_kfold] += _acc_u80
                writer.writerow([-999, -8, ell_u80_opt, _acc_u80])
            for ell_u65_opt in ell_u65_opts:
                logger.info("ELL_OPTIMAL_CV_U65 %s", ell_u65_opt)
                _acc_u65, _, _ = \
                    computing_training_testing_step(X_learning, y_learning, X_testing,
                                                    y_testing, ell_u65_opt, manager, 0, 0)
                acc_u65[idx_kfold] += _acc_u65
                writer.writerow([-999, -7, ell_u65_opt, _acc_u65])
            acc_u65[idx_kfold] = acc_u65[idx_kfold] / n_ell65_opts
            acc_u80[idx_kfold] = acc_u80[idx_kfold] / n_ell80_opts
            writer.writerow([-999, idx_kfold, acc_u65[idx_kfold], acc_u80[idx_kfold]])
            file_csv.flush()
            logger.debug("Partial-ell-2step (u80, u65, accs) (%s, %s, %s, %s, %s)", -999,
                         ell_u80_opts, ell_u65_opts, acc_u65[idx_kfold], acc_u80[idx_kfold])
        writer.writerow([-9999, -9, np.mean(list(acc_u65.values())),
                         np.mean(list(acc_u80.values()))])
        manager.poisonPillTraining()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path,
                 np.mean(list(acc_u65.values())), np.mean(list(acc_u80.values())))
def computing_best_imprecise_mean(in_path=None, out_path=None, cv_nfold=10, model_type="ilda",
                                  test_size=0.4, from_ell=0.1, to_ell=1.0, by_ell=0.1,
                                  seeds=None, lib_path_server=None, nb_process=2,
                                  n_sampling=10, skip_n_sample=0, criterion="maximality",
                                  scaling=False):
    """Tune ``ell`` on repeated hold-out samplings and score the best value(s).

    For each sampling (at most ``min(n_sampling, len(seeds))``) the data is
    split with ``train_test_split`` into learning and testing parts. Unless
    the sampling is skipped, an unseeded shuffled ``cv_nfold``-fold split of
    the learning part scores every ``ell`` in
    ``np.arange(from_ell, to_ell, by_ell)`` through
    ``computing_training_testing_step`` (u65 and u80 accumulators, averaged
    over ``cv_nfold``). The ``ell`` values with maximal u80 (resp. u65) are
    then trained on the whole learning part and scored on the testing part.
    Scores are appended to ``out_path`` as CSV rows and logged. Nothing is
    returned.

    CSV rows written: ``[ell, sampling, u65, u80]`` per grid value,
    ``[-999, sampling, u65, u80]`` per sampling, and a final
    ``[-9999, -9, mean_u65, mean_u80]``.

    :param in_path: existing header-less CSV; last column is the label.
    :param out_path: existing file, opened in append mode.
    :param cv_nfold: number of folds of the inner cross-validation.
    :param model_type: forwarded to ``manager.executeAsync``.
    :param test_size: forwarded to ``train_test_split``.
    :param from_ell: grid start; reset to 0.01 for every sampling after
        ``skip_n_sample`` when ``skip_n_sample != 0``.
    :param to_ell: grid end (exclusive, as with ``np.arange``).
    :param by_ell: grid step.
    :param seeds: one ``random_state`` per sampling; generated with
        ``generate_seeds(n_sampling)`` when ``None``.
    :param lib_path_server: forwarded to ``manager.executeAsync``.
    :param nb_process: forwarded to ``ManagerWorkers``.
    :param n_sampling: maximum number of samplings.
    :param skip_n_sample: samplings with a smaller index are skipped.
    :param criterion: forwarded to ``ManagerWorkers``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean_sampling", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, criterion)
    logger.info(
        'Parameters (size, ells, nbProcess, sampling, nSkip) (%s, %s, %s, %s, %s, %s, %s)',
        test_size, from_ell, to_ell, by_ell, nb_process, n_sampling, skip_n_sample)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    # Seeds are logged so a killed run can be resumed with the same samplings.
    seeds = generate_seeds(n_sampling) if seeds is None else seeds
    logger.debug("MODEL: %s, SEED: %s", model_type, seeds)

    # The context manager closes the results file even if a sampling raises.
    with open(out_path, 'a') as file_csv:
        writer = csv.writer(file_csv)
        manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
        manager.executeAsync(model_type, lib_path_server)
        acc_u80, acc_u65 = dict(), dict()
        for sampling in range(min(n_sampling, len(seeds))):
            X_learning, X_testing, y_learning, y_testing = \
                train_test_split(X, y, test_size=test_size, random_state=seeds[sampling])
            logger.info("Splits %s learning %s", sampling, y_learning)
            logger.info("Splits %s testing %s", sampling, y_testing)

            # After the first processed sampling, restart the grid from 0.01.
            if skip_n_sample != 0 and sampling > skip_n_sample:
                from_ell = 0.01

            # Skipping leading samplings lets several runs share the work.
            if sampling < skip_n_sample:
                continue

            kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
            ell_u65, ell_u80, splits = dict(), dict(), []
            # Materialise the splits once so every ell sees the same folds.
            for idx_train, idx_test in kf.split(y_learning):
                splits.append((idx_train, idx_test))
                logger.info("Sampling %s Splits %s train %s", sampling, len(splits), idx_train)
                logger.info("Sampling %s Splits %s test %s", sampling, len(splits), idx_test)

            for ell_current in np.arange(from_ell, to_ell, by_ell):
                ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                logger.info("ELL_CURRENT %s", ell_current)
                for idx_train, idx_test in splits:
                    logger.info("Splits train %s", idx_train)
                    logger.info("Splits test %s", idx_test)
                    X_cv_train, y_cv_train = X_learning[idx_train], y_learning[idx_train]
                    X_cv_test, y_cv_test = X_learning[idx_test], y_learning[idx_test]
                    # NOTE(review): other callers in this file unpack three
                    # values from this helper. Taking the first two
                    # positionally works whether it returns (u65, u80) or
                    # (u65, u80, extra).
                    step_scores = computing_training_testing_step(
                        X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_current,
                        manager, ell_u65[ell_current], ell_u80[ell_current])
                    ell_u65[ell_current], ell_u80[ell_current] = step_scores[0], step_scores[1]
                    logger.info("Partial-kfold (%s, %s, %s)", ell_current,
                                ell_u65[ell_current], ell_u80[ell_current])
                ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
                ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
                writer.writerow([ell_current, sampling, ell_u65[ell_current],
                                 ell_u80[ell_current]])
                file_csv.flush()
                logger.debug("Partial-ell-sampling (%s, %s, %s, %s)", ell_current,
                             sampling, ell_u65, ell_u80)
            logger.debug("Total-ell-sampling (%s, %s, %s, %s)", in_path, sampling,
                         ell_u65, ell_u80)

            # All ell values tied at the best score are evaluated and averaged.
            acc_ellu80 = max(ell_u80.values())
            acc_ellu65 = max(ell_u65.values())
            ellu80_opts = [k for k, v in ell_u80.items() if v == acc_ellu80]
            ellu65_opts = [k for k, v in ell_u65.items() if v == acc_ellu65]
            acc_u65[sampling], acc_u80[sampling] = 0, 0
            n_ell80_opts, n_ell65_opts = len(ellu80_opts), len(ellu65_opts)
            for ellu80_opt in ellu80_opts:
                logger.info("ELL_OPTIMAL_SAMPLING_U80 %s", ellu80_opt)
                step_scores = computing_training_testing_step(
                    X_learning, y_learning, X_testing, y_testing, ellu80_opt,
                    manager, 0, acc_u80[sampling])
                acc_u80[sampling] = step_scores[1]
            for ellu65_opt in ellu65_opts:
                logger.info("ELL_OPTIMAL_SAMPLING_U65 %s", ellu65_opt)
                step_scores = computing_training_testing_step(
                    X_learning, y_learning, X_testing, y_testing, ellu65_opt,
                    manager, acc_u65[sampling], 0)
                acc_u65[sampling] = step_scores[0]
            acc_u65[sampling] = acc_u65[sampling] / n_ell65_opts
            acc_u80[sampling] = acc_u80[sampling] / n_ell80_opts
            writer.writerow([-999, sampling, acc_u65[sampling], acc_u80[sampling]])
            file_csv.flush()
            logger.debug("Partial-ell-2step (%s, %s, %s, %s)", -999, ellu80_opts,
                         acc_u65[sampling], acc_u80[sampling])
        writer.writerow([
            -9999, -9,
            np.mean(list(acc_u65.values())),
            np.mean(list(acc_u80.values()))
        ])
        manager.poisonPillTraining()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path,
                 np.mean(list(acc_u65.values())), np.mean(list(acc_u80.values())))