def computing_precise_vs_imprecise(in_path=None,
                                   ell_optimal=0.1,
                                   cv_n_fold=10,
                                   seeds=None,
                                   lib_path_server=None,
                                   model_type_precise='lda',
                                   model_type_imprecise='ilda',
                                   scaling=True):
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    logger = create_logger("computing_precise_vs_imprecise", True)
    logger.info('Training dataset and models (%s, %s, %s, %s)', in_path,
                model_type_precise, model_type_imprecise, ell_optimal)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    model_impr = __factory_model(model_type_imprecise,
                                 init_matlab=True,
                                 add_path_matlab=lib_path_server,
                                 DEBUG=False)
    model_prec = __factory_model_precise(model_type_precise, store_covariance=True)
    avg_imprecise, avg_precise, n_real_times = 0, 0, 0
    for time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        imprecise_mean, precise_mean, n_real_fold = 0, 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model_impr.learn(X=X_cv_train, y=y_cv_train, ell=ell_optimal)
            model_prec.fit(X_cv_train, y_cv_train)
            n_real_tests, time_precise, time_imprecise = 0, 0, 0
            n_test, _ = X_cv_test.shape
            for i, test in enumerate(X_cv_test):
                evaluate_imp, _ = model_impr.evaluate(test)
                evaluate = model_prec.predict([test])
                if len(evaluate_imp) > 1:
                    n_real_tests += 1
                    if y_cv_test[i] in evaluate_imp:
                        time_imprecise += 1
                    if y_cv_test[i] in evaluate:
                        time_precise += 1
                logger.debug(
                    "(time, iTest, ellOptimal, cautious, prediction, ground-truth) (%s, %s, %s, %s, %s, %s)",
                    time, i, ell_optimal, evaluate_imp, evaluate, y_cv_test[i])
            logger.debug(
                "(time, ellOptimal, nRealTests, timeImprecise, timePrecise) (%s, %s, %s, %s, %s)",
                time, ell_optimal, n_real_tests, time_imprecise, time_precise)
            if n_real_tests > 0:
                n_real_fold += 1
                imprecise_mean += time_imprecise / n_real_tests
                precise_mean += time_precise / n_real_tests
        logger.debug("(time, nRealFold, imprecise, precise) (%s, %s, %s, %s)",
                     time, n_real_fold, imprecise_mean, precise_mean)
        if n_real_fold > 0:
            n_real_times += 1
            avg_imprecise += imprecise_mean / n_real_fold
            avg_precise += precise_mean / n_real_fold
    logger.debug("(dataset, models, imprec, prec) (%s, %s, %s, %s, %s)",
                 in_path, model_type_imprecise, model_type_precise,
                 avg_imprecise / n_real_times, avg_precise / n_real_times)
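# A hedged usage sketch of the comparison above (not part of the library):
# with in_path=None the function falls back to the bundled iris data; the
# MATLAB library path below is a hypothetical placeholder.
#
#   computing_precise_vs_imprecise(in_path=None,
#                                  ell_optimal=0.1,
#                                  cv_n_fold=10,
#                                  lib_path_server="/opt/matlab_libs",  # hypothetical
#                                  model_type_precise='lda',
#                                  model_type_imprecise='ilda')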
def __init__(self,
             solver_matlab=False,
             gda_method="nda",
             add_path_matlab=None,
             DEBUG=False):
    """
    :param solver_matlab: if True, create a single classifier that handles the
        m binary classifiers (exact MATLAB solver); if False, create one
        classifier per binary classifier (approximate Python solver)
    :param gda_method: inda, ieda, ilda, iqda
    :param add_path_matlab:
    :param DEBUG:
    """
    super(IGDA_BR, self).__init__(DEBUG)
    self.gda_models = None
    self.nb_feature = None
    self.__solver_matlab = solver_matlab
    self.__igda_name = "i" + gda_method
    self.__gda_name = gda_method
    self._logger = create_logger("IGDA_BR", DEBUG)
    if self.__solver_matlab:
        self._global_gda_imprecise = _factory_igda_model(
            model_type=self.__igda_name,
            solver_matlab=True,
            add_path_matlab=add_path_matlab,
            DEBUG=DEBUG)
def computing_outer_vs_exact_inference_random_tree(out_path,
                                                   nb_labels=3,
                                                   nb_repeats=100,
                                                   nb_process=1,
                                                   seed=None,
                                                   min_epsilon_param=0.05,
                                                   max_epsilon_param=0.5,
                                                   step_epsilon_param=0.05):
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_outer_vs_exact_inference_random_tree", True)
    logger.info('Results file (%s)', out_path)
    logger.info("(nb_repeats, nb_process, nb_labels) (%s, %s, %s)",
                nb_repeats, nb_process, nb_labels)
    logger.info(
        "(min_epsilon_param, max_epsilon_param, step_epsilon_param) (%s, %s, %s)",
        min_epsilon_param, max_epsilon_param, step_epsilon_param)
    if seed is None:
        seed = random.randrange(pow(2, 20))
    random.seed(seed)
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    # Create a CSV file for saving prediction times
    out_path_partial = out_path[:-4] + "_time.csv"
    if not os.path.exists(out_path_partial):
        with open(out_path_partial, 'w'):
            pass
    f_time_csv = open(out_path_partial, 'a')
    writer_time = csv.writer(f_time_csv)
    POOL = multiprocessing.Pool(processes=nb_process)
    for epsilon in np.arange(min_epsilon_param, max_epsilon_param, step_epsilon_param):
        target_function = partial(parallel_inferences,
                                  nb_labels=nb_labels,
                                  epsilon=epsilon)
        set_distance_cardinal = POOL.map(target_function, range(nb_repeats))
        set_distance_cardinal = np.array(set_distance_cardinal)
        # writing distance outer vs exact procedure
        writer.writerow(np.hstack((epsilon, set_distance_cardinal[:, 0])))
        file_csv.flush()
        logger.info("Partial-s-k_step (%s, %s)", str(epsilon),
                    sum(set_distance_cardinal[:, 0]) / nb_repeats)
        # writing time naive vs exact procedure
        writer_time.writerow(np.hstack((epsilon, "exact", set_distance_cardinal[:, 1])))
        writer_time.writerow(np.hstack((epsilon, "naive", set_distance_cardinal[:, 2])))
        f_time_csv.flush()
        logger.info("Partial-avg-time (%s, %s)", str(epsilon),
                    np.mean(set_distance_cardinal[:, 1:3], axis=0))
    file_csv.close()
    logger.info("Results Final")
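# Hedged usage sketch: one CSV row of distances per epsilon in
# [min_epsilon_param, max_epsilon_param) (np.arange excludes the upper bound),
# plus two rows of exact/naive timings in the companion *_time.csv file.
# The results file must already exist, since the function appends to it.
#
#   computing_outer_vs_exact_inference_random_tree("results_inference.csv",  # hypothetical path
#                                                  nb_labels=3,
#                                                  nb_repeats=100,
#                                                  nb_process=4)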
def performance_accuracy_noise_corrupted_test_data(in_train_paths=None,
                                                   in_tests_paths=None,
                                                   model_type_precise='lda',
                                                   model_type_imprecise='ilda',
                                                   ell_optimal=0.1,
                                                   scaling=False,
                                                   lib_path_server=None,
                                                   nb_process=10):
    assert isinstance(in_train_paths, list), \
        "Without training data, cannot create the model"
    assert isinstance(in_tests_paths, list), \
        "Without testing data, cannot compute the accuracy"
    logger = create_logger("performance_accuracy_noise_corrupted_test_data", True)
    logger.info('Training dataset (%s, %s, %s)', in_train_paths,
                model_type_imprecise, ell_optimal)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type_imprecise, lib_path_server)
    versus = model_type_imprecise + "_vs_" + model_type_precise
    file_csv = open("results_" + versus + "_noise_accuracy.csv", 'w')
    writer = csv.writer(file_csv)
    model_precise = __factory_model_precise(model_type_precise, store_covariance=True)
    for in_train_path in in_train_paths:
        X_train, y_train = dataset_to_Xy(in_train_path, scaling=scaling)
        model_precise.fit(X_train, y_train)
        accuracies = dict({})
        for in_test_path in in_tests_paths:
            X_test, y_test = dataset_to_Xy(in_test_path, scaling=scaling)
            _u65, _u80, _set = computing_training_testing_step(
                X_train, y_train, X_test, y_test, ell_optimal, manager, 0, 0, 0)
            evaluate = model_precise.predict(X_test)
            _acc = sum(1 for k, j in zip(evaluate, y_test) if k == j) / len(y_test)
            logger.debug("accuracy-in_test_path (%s, %s, %s, %s, %s, %s)",
                         ntpath.basename(in_train_path),
                         ntpath.basename(in_test_path), ell_optimal, _u65,
                         _u80, _acc)
            accuracies[ntpath.basename(in_test_path)] = [
                ell_optimal, _u65, _u80, _set, _acc
            ]
            writer.writerow([
                ntpath.basename(in_train_path),
                ntpath.basename(in_test_path), ell_optimal, _u65, _u80, _set,
                _acc
            ])
            file_csv.flush()
        logger.debug("Partial-finish-accuracy-noise-corrupted_test %s: %s",
                     ntpath.basename(in_train_path), accuracies)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Finish-accuracy-noise-corrupted_test")
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  cv_nfold=10,
                                  model_type="ieda",
                                  test_size=0.4,
                                  from_ell=0.1,
                                  to_ell=1.0,
                                  by_ell=0.1,
                                  seed=None,
                                  lib_path_server=None,
                                  scaling=False):
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset %s', in_path)
    data = pd.read_csv(in_path)  # , header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    ell_u65, ell_u80 = dict(), dict()
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("MODEL: %s, SEED: %s", model_type, seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=seed)
    kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
    splits = list([])
    for idx_train, idx_test in kf.split(y_train):
        splits.append((idx_train, idx_test))
        logger.info("Splits %s train %s", len(splits), idx_train)
        logger.info("Splits %s test %s", len(splits), idx_test)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=True)
    for ell_current in np.arange(from_ell, to_ell, by_ell):
        ell_u65[ell_current], ell_u80[ell_current] = 0, 0
        logger.info("ELL_CURRENT %s", ell_current)
        for idx_train, idx_test in splits:
            logger.info("Splits train %s", idx_train)
            logger.info("Splits test %s", idx_test)
            X_cv_train, y_cv_train = X_train[idx_train], y_train[idx_train]
            X_cv_test, y_cv_test = X_train[idx_test], y_train[idx_test]
            model.learn(X=X_cv_train, y=y_cv_train, ell=ell_current)
            sum_u65, sum_u80 = 0, 0
            n_test = len(idx_test)
            for i, test in enumerate(X_cv_test):
                evaluate = model.evaluate(test)
                logger.debug(
                    "(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                    i, ell_current, evaluate, y_cv_test[i])
                if y_cv_test[i] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            ell_u65[ell_current] += sum_u65 / n_test
            ell_u80[ell_current] += sum_u80 / n_test
            logger.debug("Partial-kfold (%s, %s, %s)", ell_current,
                         ell_u65[ell_current], ell_u80[ell_current])
        ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
        ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
        writer.writerow([ell_current, ell_u65[ell_current], ell_u80[ell_current]])
        file_csv.flush()
        logger.debug("Partial-ell (%s, %s, %s)", ell_current, ell_u65, ell_u80)
    file_csv.close()
    logger.debug("Total-ell %s %s %s", in_path, ell_u65, ell_u80)
def performance_hold_out(in_path=None,
                         out_path=None,
                         model_type='lda',
                         test_pct=0.4,
                         n_times=10,
                         seeds=None,
                         scaling=False):
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"
    logger = create_logger("performance_hold_out", True)
    logger.info('Training data set %s, test percentage %s, model_type %s',
                in_path, test_pct, model_type)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = data.iloc[:, -1].tolist()
    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)
    file_csv = open(out_path, 'w')
    writer = csv.writer(file_csv)
    model = __factory_model_precise(model_type, store_covariance=True)
    mean_u65, mean_u80 = np.array([]), np.array([])
    for i in range(0, n_times):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_pct, random_state=seeds[i])
        sum_u65, sum_u80 = 0, 0
        model.fit(X_train, y_train)
        n, _ = X_test.shape
        for j, test in enumerate(X_test):
            evaluate = model.predict([test])
            if y_test[j] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        logger.info("time, u65, u80 (%s, %s, %s)", i, sum_u65 / n, sum_u80 / n)
        mean_u65 = np.append(mean_u65, sum_u65 / n)
        mean_u80 = np.append(mean_u80, sum_u80 / n)
        writer.writerow([-999, i, mean_u65[i], mean_u80[i]])
        file_csv.flush()
    file_csv.close()
    logger.info("[total:data-set:avgResults] (%s, %s)", np.mean(mean_u65),
                np.mean(mean_u80))
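# The u65/u80 helpers used above are assumed to be the standard
# discounted-accuracy utilities for set-valued predictions (Zaffalon et al.);
# a minimal sketch of that assumption, where the reward depends only on the
# size of a prediction set that contains the ground truth:
def u65_sketch(prediction_set):
    """Assumed u65 utility: 1.6/|Y| - 0.6/|Y|^2 for a set Y covering the truth."""
    k = len(prediction_set)
    return 1.6 / k - 0.6 / k ** 2


def u80_sketch(prediction_set):
    """Assumed u80 utility: 2.2/|Y| - 1.2/|Y|^2 for a set Y covering the truth."""
    k = len(prediction_set)
    return 2.2 / k - 1.2 / k ** 2

# e.g. a correct singleton scores 1.0 under both utilities, while a two-label
# set containing the truth scores 0.65 (u65) and 0.80 (u80).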
def performance_accuracy_hold_out(in_path=None,
                                  model_type="ilda",
                                  ell_optimal=0.1,
                                  lib_path_server=None,
                                  seeds=None,
                                  DEBUG=False,
                                  scaling=False):
    assert os.path.exists(in_path), \
        "Without training data, cannot perform hold-out accuracy"
    logger = create_logger("performance_accuracy_hold_out", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)
    seeds = generate_seeds(10) if seeds is None else seeds  # default: 10 hold-out repetitions
    logger.info('Seeds used for accuracy %s', seeds)
    n_time = len(seeds)
    mean_u65, mean_u80 = 0, 0
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=DEBUG)
    for k in range(0, n_time):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seeds[k])
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = model.evaluate(test)
            logger.debug(
                "(testing, ell_optimal, prediction, ground-truth) (%s, %s, %s, %s)",
                i, ell_optimal, evaluate, y_test[i])
            if y_test[i] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, k,
                     sum_u65 / n_test, sum_u80 / n_test)
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / n_time
    mean_u80 = mean_u80 / n_time
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal, mean_u65, mean_u80)
def performance_cv_accuracy(in_path=None,
                            model_type='lda',
                            cv_n_fold=10,
                            seeds=None,
                            scaling=False):
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_cv_accuracy", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, model_type)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)
    for time in range(cv_n_fold):
        # Generate a random k-fold validation.
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        model = __factory_model_precise(model_type, store_covariance=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model.fit(X_cv_train, y_cv_train)
            n_test = len(idx_test)
            sum_u65, sum_u80 = 0, 0
            for i, test in enumerate(X_cv_test):
                evaluate = model.predict([test])
                logger.debug("(testing, prediction, ground-truth) (%s, %s, %s)",
                             i, evaluate, y_cv_test[i])
                if y_cv_test[i] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            mean_u65 += sum_u65 / n_test
            mean_u80 += sum_u80 / n_test
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", time, seeds[time],
                    mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    logger.info("[Total:data-set:avgResults] (%s, %s, %s)", in_path,
                avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
def performance_cv_accuracy_imprecise(in_path=None,
                                      model_type="ilda",
                                      ell_optimal=0.1,
                                      nb_process=2,
                                      lib_path_server=None,
                                      cv_n_fold=10,
                                      seeds=None,
                                      criterion="maximality"):
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path)
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s, %s)', in_path, model_type,
                ell_optimal, criterion)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    for time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            logger.info("Splits train %s", idx_train)
            logger.info("Splits test %s", idx_test)
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            mean_u65, mean_u80 = computing_training_testing_step(
                X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_optimal,
                manager, mean_u65, mean_u80)
            logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, time,
                         mean_u65, mean_u80)
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", time, seeds[time],
                    mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    manager.poisonPillTraining()
    logger.debug("total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
def computing_outer_vs_exact_ranking_random_tree(out_path,
                                                 nb_labels=3,
                                                 nb_repeats=100,
                                                 nb_process=1,
                                                 seed=None,
                                                 min_epsilon_param=0.05,
                                                 max_epsilon_param=0.50,
                                                 step_epsilon_param=0.05):
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_outer_vs_exact_ranking_random_tree", True)
    logger.info('Results file (%s)', out_path)
    logger.info("(nb_repeats, nb_process, nb_labels) (%s, %s, %s)",
                nb_repeats, nb_process, nb_labels)
    logger.info(
        "(min_epsilon_param, max_epsilon_param, step_epsilon_param) (%s, %s, %s)",
        min_epsilon_param, max_epsilon_param, step_epsilon_param)
    if seed is None:
        seed = random.randrange(pow(2, 20))
    random.seed(seed)
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    POOL = multiprocessing.Pool(processes=nb_process)
    for epsilon in np.arange(min_epsilon_param, max_epsilon_param, step_epsilon_param):
        target_function = partial(parallel_inferences,
                                  nb_labels=nb_labels,
                                  epsilon=epsilon)
        set_distance_cardinal = POOL.map(target_function, range(nb_repeats))
        writer.writerow(np.hstack((epsilon, set_distance_cardinal)))
        file_csv.flush()
        logger.info("Partial-s-k_step (%s, %s)", str(epsilon),
                    sum(set_distance_cardinal) / nb_repeats)
    file_csv.close()
    logger.info("Results Final")
def computing_time_prediction(in_path=None,
                              ell_optimal=0.1,
                              lib_path_server=None,
                              model_type="ilda",
                              criterion="maximality",
                              k_repetition=10,
                              seeds=None):
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("computing_time_prediction", True)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    seeds = generate_seeds(k_repetition) if seeds is None else seeds
    logger.info(
        'Training dataset %s with maximality version (%s) and model (%s), '
        'ell_optimal (%s) and seeds %s',
        in_path, criterion, model_type, ell_optimal, seeds)
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=False)
    avg = np.array([])
    for k in range(k_repetition):
        logger.info("%s-fold repetition randomly, seed %s", k, seeds[k])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seeds[k])
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        n, _ = X_test.shape
        sum_time = 0
        for i, test in enumerate(X_test):
            start = time.time()
            evaluate = model.evaluate(test, criterion=criterion)
            end = time.time()
            logger.info("Evaluate %s, Ground-truth %s, Time %s", evaluate,
                        y_test[i], (end - start))
            sum_time += (end - start)
        avg = np.append(avg, sum_time / n)
    logger.info("Total time (%s, %s) and average %s and sd %s of %s testing",
                in_path, avg, np.mean(avg), np.std(avg), n)
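# Hedged usage sketch: average per-instance prediction time over 10 random
# 80/20 splits under the maximality criterion (the dataset path is hypothetical):
#
#   computing_time_prediction(in_path="datasets/iris.csv",  # hypothetical
#                             ell_optimal=0.1,
#                             model_type="ilda",
#                             criterion="maximality",
#                             k_repetition=10)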
def performance_cv_accuracy_imprecise(in_path=None,
                                      model_type="ilda",
                                      ell_optimal=0.1,
                                      scaling=False,
                                      lib_path_server=None,
                                      cv_n_fold=10,
                                      seeds=None,
                                      nb_process=10):
    assert os.path.exists(in_path), \
        "Without training data, cannot perform cross-validation accuracy"
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type, lib_path_server)
    for time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            mean_u65, mean_u80, _ = computing_training_testing_step(
                X[idx_train], y[idx_train], X[idx_test], y[idx_test],
                ell_optimal, manager, mean_u65, mean_u80)
            logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, time,
                         mean_u65, mean_u80)
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", time, seeds[time],
                    mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    manager.poisonPillTraining()
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=True,
                                  max_ncc_s_param=5,
                                  remove_features=None):
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    # Seeding a random value for k-fold top learning-testing data
    if seed is not None:
        random.seed(seed)
    seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(class_model="classifip.models.mlcncc.MLCNCC")
    ich, cph = dict(), dict()
    min_discretize, max_discretize = 5, 9
    for nb_disc in range(min_discretize, max_discretize):
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                data_learning.remove_col(r_feature)
        nb_labels = get_nb_labels_class(data_learning)
        if scaling:
            normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)
        for time in range(nb_kFold):  # 10-10 times cross-validation
            logger.info("Number interval for discreteness and labels (%1d, %1d)."
                        % (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])
            splits_s = list([])
            for training, testing in cv_kfold:
                splits_s.append((training, testing))
                logger.info("Splits %s train %s", len(training.data), training.data[0])
                logger.info("Splits %s test %s", len(testing.data), testing.data[0])
            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            for s_ncc in np.arange(0.1, max_ncc_s_param + 1, 1):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    ich[disc][ks_ncc], cph[disc][ks_ncc] = \
                        computing_training_testing_step(
                            training, testing, nb_labels, s_ncc, manager,
                            ich[disc][ks_ncc], cph[disc][ks_ncc])
                writer.writerow([
                    str(nb_disc), s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                    cph[disc][ks_ncc] / nb_kFold
                ])
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s)", disc,
                             s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s", ich, cph)
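# Hedged usage sketch: grid search over the NCC s parameter (np.arange gives
# 0.1, 1.1, ..., max_ncc_s_param + 0.1) combined with equal-frequency
# discretization into 5..8 intervals; the file paths are hypothetical.
#
#   computing_best_imprecise_mean(in_path="datasets/emotions.arff",  # hypothetical
#                                 out_path="results_mlcncc.csv",     # must already exist
#                                 nb_kFold=10,
#                                 nb_process=4,
#                                 max_ncc_s_param=5)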
class MLCNCC(metaclass=abc.ABCMeta):
    # global static variables
    LABEL_PARTIAL_VALUE = -1
    logger_global = create_logger('MLCNCC_GLOBAL', True)
    """
    NCCBR implements the naive credal classification method using the IDM for
    multilabel classification with binary relevance.

    Base classifier NCC based on [#zaffalon2002]_ and on the improvement
    proposed by [#corani2010]_

    :param feature_count: store counts of couples label/feature
    :type feature_count: dictionary with keys label/feature

    :param label_counts: store counts of class labels (to instantiate the prior)
    :type label_counts: list

    :param feature_names: store the names of features
    :type feature_names: list

    :param feature_values: store modalities of features
    :type feature_values: dictionary associating each feature name to a list
    """

    def __init__(self, DEBUG=False):
        self.feature_names = []
        self.label_names = []
        self.feature_values = dict()
        self.feature_count = dict()
        self.label_counts = []
        self.nb_labels = 0
        self.training_size = 0
        self.marginal_props = None
        self.DEBUG = DEBUG
        self.has_imprecise_marginal = False
        self._logger = create_logger("MLCNCC", DEBUG)

    def learn(self, learn_data_set, nb_labels):
        """learn the NCC for each label, mainly storing counts of feature/label pairs

        :param learn_data_set: learning instances
        :type learn_data_set: :class:`~classifip.dataset.arff.ArffFile`
        :param nb_labels: number of labels
        :type nb_labels: integer
        """
        self.__init__()
        self.nb_labels = nb_labels
        # Initializing the counts
        self.feature_names = learn_data_set.attributes[:-self.nb_labels]
        self.label_names = np.array(learn_data_set.attributes[-self.nb_labels:])
        self.feature_values = learn_data_set.attribute_data.copy()
        # computing precise marginal P(Y) count
        self.marginal_props = dict({i: dict() for i in range(self.nb_labels)})
        for label_index, label_value in enumerate(self.label_names):
            # recovery count of class 1 and 0
            label_set_one = learn_data_set.select_col_vals(label_value, ['1'])
            label_set_zero = learn_data_set.select_col_vals(label_value, ['0'])
            nb_count_one, nb_count_zero = len(label_set_one.data), len(label_set_zero.data)
            # if we work with missing labels (label=-1: missing), the marginal values change
            # (1) Computing label proportions
            self.marginal_props[label_index][0] = nb_count_zero
            self.marginal_props[label_index][1] = nb_count_one
            self.marginal_props[label_index]['all'] = float(nb_count_one + nb_count_zero)
            # (2) Computing counting label|attributes
            for feature in self.feature_names:
                count_vector_one, count_vector_zero = [], []
                feature_index = learn_data_set.attributes.index(feature)
                for feature_value in learn_data_set.attribute_data[feature]:
                    nb_items_one = [
                        row[feature_index] for row in label_set_one.data
                    ].count(feature_value)
                    count_vector_one.append(nb_items_one)
                    nb_items_zero = [
                        row[feature_index] for row in label_set_zero.data
                    ].count(feature_value)
                    count_vector_zero.append(nb_items_zero)
                self.feature_count[label_value + '|in|' + feature] = count_vector_one
                self.feature_count[label_value + '|out|' + feature] = count_vector_zero
            # (3) Computing counting label|other_labels
            for label_feature in self.label_names:
                if label_feature != label_value:
                    label_feature_index = learn_data_set.attributes.index(label_feature)
                    count_vector_one, count_vector_zero = [], []
                    for label_feature_value in learn_data_set.attribute_data[label_feature]:
                        nb_items_one = [
                            row[label_feature_index]
                            for row in label_set_one.data
                        ].count(label_feature_value)
                        count_vector_one.append(nb_items_one)
                        nb_items_zero = [
                            row[label_feature_index]
                            for row in label_set_zero.data
                        ].count(label_feature_value)
                        count_vector_zero.append(nb_items_zero)
                    self.feature_count[label_value + '|in|' + label_feature] = count_vector_one
                    self.feature_count[label_value + '|out|' + label_feature] = count_vector_zero

    @abc.abstractmethod
    def evaluate(self,
                 test_dataset,
                 ncc_epsilon=0.001,
                 ncc_s_param=2.0,
                 with_imprecise_marginal=False,
                 precision=None):
        pass

    @staticmethod
    def __random_set_labels_index(dataset, nb_labels, seed_random_label=None):
        """
        :param dataset:
        :param nb_labels:
        :param seed_random_label:
        :return:
        """
        # Generate a random position for the chain labels
        if seed_random_label is None:
            seed_random_label = random.randrange(pow(2, 20))
        MLCNCC.logger_global.info(
            "[__random_set_labels_index] seed random label (%s)",
            seed_random_label)
        label_names = np.array(dataset.attributes[-nb_labels:])
        origin_indices = dict(zip(label_names, range(nb_labels)))
        np.random.seed(seed_random_label)
        np.random.shuffle(label_names)
        MLCNCC.logger_global.info(
            "[__random_set_labels_index] origin index (%s)", origin_indices)
        MLCNCC.logger_global.info(
            "[__random_set_labels_index] shuffle labels (%s)", label_names)
        return origin_indices, label_names

    @staticmethod
    def shuffle_labels(dataset, nb_labels, seed_random_label=None):
        """
        :param dataset: (mutable)
        :type dataset: classifip.dataset.arff.ArffFile with string values for
            columns (after data discretization)
            (warning: does not work with mixed values (float, string))
        :param nb_labels:
        :param seed_random_label: randomly mixing labels Y1, Y2, ..., Ym
        :type seed_random_label: float
        :return: <void> modifies the structure of the dataset parameter
        """
        nb_cols = len(dataset.attributes)
        origin_indices, label_names = MLCNCC.__random_set_labels_index(
            dataset, nb_labels, seed_random_label)
        np_data = np.array(dataset.data)
        new_data_labels = np.empty((len(dataset.data), nb_labels), dtype='<U1')
        for index, label in enumerate(label_names):
            orig_idx = origin_indices[label]
            new_data_labels[:, index] = np.array(
                np_data[:, nb_cols - nb_labels + orig_idx])
            dataset.attributes[nb_cols - nb_labels + index] = label
        np_data[:, -nb_labels:] = new_data_labels
        dataset.data = np_data.tolist()

    @staticmethod
    def shuffle_labels_train_testing(train_dataset,
                                     testing_dataset,
                                     nb_labels,
                                     seed_random_label=None):
        """
        :param train_dataset: (mutable)
        :type train_dataset: classifip.dataset.arff.ArffFile
        :param testing_dataset: (mutable)
        :type testing_dataset: classifip.dataset.arff.ArffFile
        :param nb_labels:
        :param seed_random_label:
        :return:
        """
        nb_cols = len(train_dataset.attributes)
        origin_indices, label_names = MLCNCC.__random_set_labels_index(
            train_dataset, nb_labels, seed_random_label)
        np_data_train = np.array(train_dataset.data)
        np_data_test = np.array(testing_dataset.data)
        new_ltrain = np.empty((len(train_dataset.data), nb_labels), dtype='<U1')
        new_ltest = np.empty((len(testing_dataset.data), nb_labels), dtype='<U1')
        for index, label in enumerate(label_names):
            orig_idx = origin_indices[label]
            # exchange columns of the training dataset
            new_ltrain[:, index] = np.array(
                np_data_train[:, nb_cols - nb_labels + orig_idx])
            train_dataset.attributes[nb_cols - nb_labels + index] = label
            # exchange columns of the testing dataset
            new_ltest[:, index] = np.array(
                np_data_test[:, nb_cols - nb_labels + orig_idx])
            testing_dataset.attributes[nb_cols - nb_labels + index] = label
        np_data_train[:, -nb_labels:] = new_ltrain
        np_data_test[:, -nb_labels:] = new_ltest
        train_dataset.data = np_data_train.tolist()
        testing_dataset.data = np_data_test.tolist()

    @staticmethod
    def missing_labels_learn_data_set(learn_data_set,
                                      nb_labels,
                                      missing_pct=0.0):
        """
        :param learn_data_set:
        :type learn_data_set: arff
        :param nb_labels: number of labels
        :type nb_labels: integer
        :param missing_pct: percentage of missing labels
        :type missing_pct: float
        :return:
        """
        if missing_pct < 0.0 or missing_pct > 1.0:
            raise Exception('Negative percentage or higher than one of missing label.')
        if missing_pct > 0.0:
            label_names = learn_data_set.attributes[-nb_labels:]
            for label_value in label_names:
                missing_label_index = np.random.choice(
                    len(learn_data_set.data),
                    int(len(learn_data_set.data) * missing_pct),
                    replace=False)
                col_ind = learn_data_set.attributes.index(label_value)
                for index, value in enumerate(learn_data_set.data):
                    if index in missing_label_index:
                        value[col_ind] = '-1'

    @staticmethod
    def noise_labels_learn_data_set(learn_data_set, nb_labels, noise_label_pct,
                                    noise_label_type, noise_label_prob):
        """
        :param learn_data_set:
        :type learn_data_set: arff
        :param nb_labels: number of labels
        :type nb_labels: integer
        :param noise_label_pct: percentage of noisy labels
        :type noise_label_pct: float
        :param noise_label_type: type of label-flipping noise
            (1) reverse change 1-0
            (2) label relevant 1 with probability p (Bernoulli trials)
            (3) label relevant 1 with probability greater than p (uniform randomly)
        :type noise_label_type: integer
        :param noise_label_prob: probability to flip a label
        :type noise_label_prob: float
        """
        if noise_label_type not in [1, 2, 3, -1]:
            raise Exception('Configuration noise label is not implemented yet.')
        if noise_label_pct < 0.0 or noise_label_pct > 1.0:
            raise Exception('Negative percentage or higher than one of noise label.')
        if noise_label_pct > 0.0 and noise_label_type in [1, 2, 3]:
            size_learn_data = len(learn_data_set.data)
            set_label_index = np.zeros((size_learn_data, nb_labels), dtype=int)
            for i in range(nb_labels):
                noise_index_by_label = np.random.choice(
                    size_learn_data,
                    int(size_learn_data * noise_label_pct),
                    replace=False)
                if noise_label_type == 1:
                    set_label_index[noise_index_by_label, i] = 1
                elif noise_label_type == 2:
                    noise_label_flip = np.random.choice(
                        [0, 1],
                        size=int(size_learn_data * noise_label_pct),
                        p=[1 - noise_label_prob, noise_label_prob])
                    set_label_index[noise_index_by_label, i] = 3 - noise_label_flip  # 2:=1 and 3:=0
                elif noise_label_type == 3:
                    noise_uniform_rand = np.random.uniform(
                        size=int(size_learn_data * noise_label_pct))
                    noise_uniform_rand[noise_uniform_rand >= noise_label_prob] = 1
                    noise_uniform_rand[noise_uniform_rand < noise_label_prob] = 0
                    set_label_index[noise_index_by_label, i] = 3 - noise_uniform_rand  # 2:=1 and 3:=0
            if noise_label_type == 1:
                for i, instance in enumerate(learn_data_set.data):
                    noise_label_by_inst = abs(
                        set_label_index[i, :] -
                        np.array(instance[-nb_labels:], dtype=int))
                    instance[-nb_labels:] = noise_label_by_inst.astype('<U1').tolist()
            elif noise_label_type == 2 or noise_label_type == 3:
                for i, instance in enumerate(learn_data_set.data):
                    idx_zero = np.where(set_label_index[i, :] == 3)
                    idx_one = np.where(set_label_index[i, :] == 2)
                    noise_labels_value = np.array(instance[-nb_labels:], dtype=int)
                    noise_labels_value[idx_zero] = 0
                    noise_labels_value[idx_one] = 1
                    instance[-nb_labels:] = noise_labels_value.astype('<U1').tolist()
            else:
                raise Exception('Configuration noise label is not implemented yet.')

    def lower_upper_probability(self, feature, feature_value, ncc_s_param,
                                feature_class_name, ncc_epsilon):
        """
        .. note::
            Zero float division can happen if there are too many input features.
            To avoid probability zero, we use Laplace smoothing
            https://en.wikipedia.org/wiki/Additive_smoothing

        :param feature:
        :param feature_value:
        :param ncc_s_param:
        :param feature_class_name:
        :param ncc_epsilon:
        :return:
        """
        def __restricting_idm(probability, ncc_epsilon_ip, len_features):
            return (1 - ncc_epsilon_ip) * probability + ncc_epsilon_ip / len_features

        f_val_index = self.feature_values[feature].index(feature_value)
        num_items = float(sum(self.feature_count[feature_class_name]))  # n(c)
        n_fi_c = self.feature_count[feature_class_name][f_val_index]  # n(f_i|c)
        len_fi = len(self.feature_count[feature_class_name])  # |F_i|
        # n(f_i|c)/(n(c)+s), lower probability: t(f_i|c)->0, t(c)->1
        try:
            p_lower = (n_fi_c / (num_items + ncc_s_param))
        except ZeroDivisionError:
            p_lower = (n_fi_c + 1) / (num_items + ncc_s_param + len_fi)
        # (n(f_i|c)+s)/(n(c)+s), upper probability: t(f_i|c)->1, t(c)->1
        try:
            p_upper = ((n_fi_c + ncc_s_param) / (num_items + ncc_s_param))
        except ZeroDivisionError:
            p_upper = ((n_fi_c + ncc_s_param + 1) / (num_items + ncc_s_param + len_fi))
        # some regularization with epsilon
        p_lower = __restricting_idm(p_lower, ncc_epsilon, len_fi)
        p_upper = __restricting_idm(p_upper, ncc_epsilon, len_fi)
        return p_lower, p_upper

    def lower_upper_marginal_Y(self, idx_label_to_infer, value_label_to_infer,
                               ncc_s_param):
        # @salmuz: missing Laplace smoothing when n_label_data=0 and ncc_s_param=0
        count_label = self.marginal_props[idx_label_to_infer][value_label_to_infer]
        n_label_data = self.marginal_props[idx_label_to_infer]["all"]
        p_lower = count_label / (n_label_data + ncc_s_param)
        p_upper = (count_label + ncc_s_param) / (n_label_data + ncc_s_param)
        self._logger.debug(
            "[Bound-Marginal] (idx_label_to_infer, p_lower, p_upper) (%s, %s, %s)",
            idx_label_to_infer, p_lower, p_upper)
        return p_lower, p_upper

    def lower_upper_probability_feature(self, idx_label_to_infer, item,
                                        ncc_s_param, ncc_epsilon):
        # (n(c)+st(c))/(N+s), with s=0 (i.e. precise prior probabilities, P(Y))
        if self.has_imprecise_marginal:
            l_denominator_0, u_denominator_0 = self.lower_upper_marginal_Y(
                idx_label_to_infer, 0, ncc_s_param)
            l_numerator_1, u_numerator_1 = self.lower_upper_marginal_Y(
                idx_label_to_infer, 1, ncc_s_param)
        else:
            all_bits_label = self.marginal_props[idx_label_to_infer]["all"]
            # Applying Laplace smoothing (where |C| = 2, binary case):
            # P(Y_i = idx_label_to_infer) = (n(idx_label_to_infer) + 1)/(n + |C|)
            prop_marginal_label_1 = (self.marginal_props[idx_label_to_infer][1] + 1) \
                / (all_bits_label + 2)
            u_denominator_0 = 1 - prop_marginal_label_1  # \overline P(Yj=0)
            l_denominator_0 = 1 - prop_marginal_label_1  # \underline P(Yj=0)
            u_numerator_1 = prop_marginal_label_1  # \overline P(Yj=1)
            l_numerator_1 = prop_marginal_label_1  # \underline P(Yj=1)
        for f_index, feature in enumerate(self.feature_names):
            # computation of the numerator (label=1)
            feature_class_name = self.label_names[idx_label_to_infer] + '|in|' + feature  # (f_i, c=1)
            p_lower, p_upper = self.lower_upper_probability(
                feature, item[f_index], ncc_s_param, feature_class_name, ncc_epsilon)
            l_numerator_1 = l_numerator_1 * p_lower  # prod \underline{P}(f_i|c=1)
            u_numerator_1 = u_numerator_1 * p_upper  # prod \overline{P}(f_i|c=1)
            # computation of the denominator (label=0)
            feature_class_name = self.label_names[idx_label_to_infer] + '|out|' + feature
            p_lower, p_upper = self.lower_upper_probability(
                feature, item[f_index], ncc_s_param, feature_class_name, ncc_epsilon)
            l_denominator_0 = l_denominator_0 * p_lower  # prod \underline{P}(f_i|c=0)
            u_denominator_0 = u_denominator_0 * p_upper  # prod \overline{P}(f_i|c=0)
        return u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0

    def lower_upper_probability_labels(self,
                                       idx_label_to_infer,
                                       augmented_labels,
                                       ncc_s_param,
                                       ncc_epsilon,
                                       idx_chain_predict_labels=None):
        """
        :param idx_label_to_infer: name of the label selected
        :param augmented_labels: list of character values '0' or '1'
        :param ncc_s_param:
        :param ncc_epsilon:
        :param idx_chain_predict_labels:
        :return:
        """
        u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0 = 1, 1, 1, 1
        if idx_chain_predict_labels is None:
            dependant_labels = enumerate(self.label_names[:len(augmented_labels)])
        else:
            dependant_labels = zip(idx_chain_predict_labels,
                                   self.label_names[idx_chain_predict_labels])
        self._logger.debug(
            "[Bound-Labels] (label_to_infer, augmented_labels, idx_chain_predict_labels) (%s, %s, %s)",
            self.label_names[idx_label_to_infer], augmented_labels,
            idx_chain_predict_labels)
        for l_index, label in dependant_labels:
            label_predicted_value = str(augmented_labels[l_index])
            # computation of the numerator (label=1)
            label_class_name = self.label_names[idx_label_to_infer] + '|in|' + label  # (l_i=1, c=1)
            p_lower, p_upper = self.lower_upper_probability(
                label, label_predicted_value, ncc_s_param, label_class_name, ncc_epsilon)
            l_numerator_1 = l_numerator_1 * p_lower  # prod \underline{P}(f_i|c=1)
            u_numerator_1 = u_numerator_1 * p_upper  # prod \overline{P}(f_i|c=1)
            # computation of the denominator (label=0)
            label_class_name = self.label_names[idx_label_to_infer] + '|out|' + label  # (l_i=0, c=0)
            p_lower, p_upper = self.lower_upper_probability(
                label, label_predicted_value, ncc_s_param, label_class_name, ncc_epsilon)
            l_denominator_0 = l_denominator_0 * p_lower  # prod \underline{P}(f_i|c=0)
            u_denominator_0 = u_denominator_0 * p_upper  # prod \overline{P}(f_i|c=0)
        return u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0

    def lower_upper_cond_probability(self, idx_label_to_infer, instance,
                                     augmented_labels, ncc_s_param, ncc_epsilon,
                                     idx_chain_predict_labels=None):
        """
        .. note::
            TO DO: to avoid probability zero, we use Laplace smoothing
            https://en.wikipedia.org/wiki/Additive_smoothing

        :param idx_label_to_infer:
        :param instance:
        :param augmented_labels:
        :param ncc_s_param:
        :param ncc_epsilon:
        :param idx_chain_predict_labels:
        :return:
        """
        u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0 = \
            self.lower_upper_probability_feature(idx_label_to_infer, instance,
                                                 ncc_s_param, ncc_epsilon)
        u_numerator_label_1, l_numerator_label_1, u_denominator_label_0, l_denominator_label_0 = \
            self.lower_upper_probability_labels(idx_label_to_infer, augmented_labels,
                                                ncc_s_param, ncc_epsilon,
                                                idx_chain_predict_labels)
        u_numerator_1 = u_numerator_1 * u_numerator_label_1
        l_numerator_1 = l_numerator_1 * l_numerator_label_1
        u_denominator_0 = u_denominator_0 * u_denominator_label_0
        l_denominator_0 = l_denominator_0 * l_denominator_label_0
        return u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0
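# A standalone numeric sketch (not part of the class) of the IDM bounds that
# lower_upper_probability computes above: with count n(f_i|c), class total n(c)
# and hyper-parameter s, the bounds are n(f_i|c)/(n(c)+s) and
# (n(f_i|c)+s)/(n(c)+s), both then shrunk towards the uniform 1/|F_i| by the
# epsilon regularization.
def idm_bounds_sketch(n_fi_c, n_c, s=2.0, epsilon=0.001, nb_modalities=3):
    p_lower = n_fi_c / (n_c + s)        # lower bound: t(f_i|c) -> 0
    p_upper = (n_fi_c + s) / (n_c + s)  # upper bound: t(f_i|c) -> 1
    restrict = lambda p: (1 - epsilon) * p + epsilon / nb_modalities
    return restrict(p_lower), restrict(p_upper)

# e.g. 7 occurrences of a feature value among 20 instances of the class:
# idm_bounds_sketch(7, 20) -> (~0.318, ~0.409); the interval widens as s grows
# and collapses to the precise estimate 7/20 when s = 0.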
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  cv_nfold=10,
                                  model_type="ilda",
                                  test_size=0.4,
                                  from_ell=0.1,
                                  to_ell=1.0,
                                  by_ell=0.1,
                                  seeds=None,
                                  lib_path_server=None,
                                  nb_process=2,
                                  n_sampling=10,
                                  skip_n_sample=0,
                                  criterion="maximality",
                                  scaling=False):
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean_sampling", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, criterion)
    logger.info(
        'Parameters (size, ells, nbProcess, sampling, nSkip) (%s, %s, %s, %s, %s, %s, %s)',
        test_size, from_ell, to_ell, by_ell, nb_process, n_sampling, skip_n_sample)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    # Seeds to recover the execution if the process is killed
    seeds = generate_seeds(n_sampling) if seeds is None else seeds
    logger.debug("MODEL: %s, SEED: %s", model_type, seeds)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    acc_u80, acc_u65 = dict(), dict()
    for sampling in range(min(n_sampling, len(seeds))):
        X_learning, X_testing, y_learning, y_testing = \
            train_test_split(X, y, test_size=test_size, random_state=seeds[sampling])
        logger.info("Splits %s learning %s", sampling, y_learning)
        logger.info("Splits %s testing %s", sampling, y_testing)
        # n-Skipping sampling and rebooting parameter from_ell to 0.01 for the next sampling
        if skip_n_sample != 0 and sampling > skip_n_sample:
            from_ell = 0.01
        # n-Skipping sampling testing (purpose: parallel computing)
        if sampling >= skip_n_sample:
            kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
            ell_u65, ell_u80, splits = dict(), dict(), list([])
            for idx_train, idx_test in kf.split(y_learning):
                splits.append((idx_train, idx_test))
                logger.info("Sampling %s Splits %s train %s", sampling,
                            len(splits), idx_train)
                logger.info("Sampling %s Splits %s test %s", sampling,
                            len(splits), idx_test)
            for ell_current in np.arange(from_ell, to_ell, by_ell):
                ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                logger.info("ELL_CURRENT %s", ell_current)
                for idx_train, idx_test in splits:
                    logger.info("Splits train %s", idx_train)
                    logger.info("Splits test %s", idx_test)
                    X_cv_train, y_cv_train = X_learning[idx_train], y_learning[idx_train]
                    X_cv_test, y_cv_test = X_learning[idx_test], y_learning[idx_test]
                    # Computing testing accuracy for the cross-validation step
                    ell_u65[ell_current], ell_u80[ell_current] = \
                        computing_training_testing_step(X_cv_train, y_cv_train,
                                                        X_cv_test, y_cv_test,
                                                        ell_current, manager,
                                                        ell_u65[ell_current],
                                                        ell_u80[ell_current])
                    logger.info("Partial-kfold (%s, %s, %s)", ell_current,
                                ell_u65[ell_current], ell_u80[ell_current])
                ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
                ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
                writer.writerow([ell_current, sampling, ell_u65[ell_current],
                                 ell_u80[ell_current]])
                file_csv.flush()
                logger.debug("Partial-ell-sampling (%s, %s, %s, %s)",
                             ell_current, sampling, ell_u65, ell_u80)
            logger.debug("Total-ell-sampling (%s, %s, %s, %s)", in_path,
                         sampling, ell_u65, ell_u80)
            # Computing the optimal ells for the testing step
            acc_ellu80 = max(ell_u80.values())
            acc_ellu65 = max(ell_u65.values())
            ellu80_opts = [k for k, v in ell_u80.items() if v == acc_ellu80]
            ellu65_opts = [k for k, v in ell_u65.items() if v == acc_ellu65]
            acc_u65[sampling], acc_u80[sampling] = 0, 0
            n_ell80_opts, n_ell65_opts = len(ellu80_opts), len(ellu65_opts)
            for ellu80_opt in ellu80_opts:
                logger.info("ELL_OPTIMAL_SAMPLING_U80 %s", ellu80_opt)
                _, acc_u80[sampling] = \
                    computing_training_testing_step(X_learning, y_learning,
                                                    X_testing, y_testing,
                                                    ellu80_opt, manager,
                                                    0, acc_u80[sampling])
            for ellu65_opt in ellu65_opts:
                logger.info("ELL_OPTIMAL_SAMPLING_U65 %s", ellu65_opt)
                acc_u65[sampling], _ = \
                    computing_training_testing_step(X_learning, y_learning,
                                                    X_testing, y_testing,
                                                    ellu65_opt, manager,
                                                    acc_u65[sampling], 0)
            acc_u65[sampling] = acc_u65[sampling] / n_ell65_opts
            acc_u80[sampling] = acc_u80[sampling] / n_ell80_opts
            writer.writerow([-999, sampling, acc_u65[sampling], acc_u80[sampling]])
            file_csv.flush()
            logger.debug("Partial-ell-2step (%s, %s, %s, %s)", -999,
                         ellu80_opts, acc_u65[sampling], acc_u80[sampling])
    writer.writerow([
        -9999, -9,
        np.mean(list(acc_u65.values())),
        np.mean(list(acc_u80.values()))
    ])
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path,
                 np.mean(list(acc_u65.values())),
                 np.mean(list(acc_u80.values())))
def experiments_binr_vs_imprecise(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  missing_pct=0.0,
                                  noise_label_pct=0.0,
                                  noise_label_type=-1,
                                  noise_label_prob=0.5,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=False,
                                  epsilon_rejects=None,
                                  min_ncc_s_param=0.5,
                                  max_ncc_s_param=6.0,
                                  step_ncc_s_param=1.0,
                                  remove_features=None,
                                  k_nearest_neighbors=None):
    """
    Experiments with binary relevance, imprecise classifiers and missing/noisy data.

    :param in_path:
    :param out_path:
    :param seed:
    :param missing_pct: percentage of missing labels
    :param noise_label_pct: percentage of noisy labels
    :param noise_label_type: type of perturbation noise
    :param noise_label_prob: probability of noisy labels
    :param nb_kFold:
    :param nb_process: number of processes in parallel
    :param scaling: scaling X input space (used for the knn-nccbr classifier)
    :param epsilon_rejects: epsilons of the reject option (for comparison with
        the imprecise version)
    :param min_ncc_s_param: minimum value of the imprecise parameter s
    :param max_ncc_s_param: maximum value of the imprecise parameter s
    :param step_ncc_s_param: discretization step of the parameter s
    :param remove_features: features not to take into account
    :param k_nearest_neighbors: k*radius_distance_pairwise_all_instance,
        how big the ball containing the neighbors is.

    ...note::
        TODO: bug when the missing percentage is high (90%), to fix.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    assert k_nearest_neighbors is not None, "None value, the knn algorithm needs a value"
    assert k_nearest_neighbors > 0, "Need a positive value for the knn algorithm"
    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info("(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
                min_ncc_s_param, max_ncc_s_param, step_ncc_s_param)
    logger.info("(scaling, remove_features, process, epsilon_rejects) (%s, %s, %s, %s)",
                scaling, remove_features, nb_process, epsilon_rejects)
    logger.info("(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
                missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    logger.info("(k_nearest_neighbors) (%s)", k_nearest_neighbors)
    # Seeding a random value for the top-level k-fold learning-testing split
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process,
                             fun_prediction=skeptical_prediction)
    manager.executeAsync(class_model="classifip.models.mlc.knnnccbr.KNN_NCC_BR")
    ich_skep, cph_skep, acc_prec = dict(), dict(), dict()
    ich_reject, cph_reject = dict(), dict()
    min_discretize, max_discretize = 5, 7
    for nb_disc in range(min_discretize, max_discretize):
        data_learning, nb_labels = init_dataset(in_path, remove_features, scaling)
        p_dimension = len(data_learning.data[0]) - nb_labels
        # saving continuous data and instance indices for the KNN-NCC-BR classification
        data_continuous = data_learning.make_clone()
        # adding a raw index to each instance when we use knn-ncc
        for idx, row_instance in enumerate(data_learning.data):
            row_instance.insert(p_dimension + nb_labels, idx)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)
        for time in range(nb_kFold):  # 10-10 times cross-validation
            logger.info("Number interval for discreteness and labels (%1d, %1d)."
                        % (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])
            splits_s = list([])
            for training, testing in cv_kfold:
                # making a clone because it shares the same memory address
                splits_s.append((training.make_clone(), testing.make_clone()))
                logger.info("Splits %s train %s", len(training.data), training.data[0][1:4])
                logger.info("Splits %s test %s", len(testing.data), testing.data[0][1:4])
            disc = str(nb_disc) + "-" + str(time)
            ich_skep[disc], cph_skep[disc], acc_prec[disc] = dict(), dict(), dict()
            ich_reject[disc], cph_reject[disc] = dict(), dict()
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param):
                ks_ncc = str(s_ncc)
                init_scores(ks_ncc, ich_skep[disc], cph_skep[disc], acc_prec[disc],
                            ich_reject[disc], cph_reject[disc], epsilon_rejects)
                for idx_fold, (training, testing) in enumerate(splits_s):
                    logger.info("Splits %s train %s", len(training.data),
                                training.data[0][1:4])
                    logger.info("Splits %s test %s", len(testing.data),
                                testing.data[0][1:4])
                    rs = computing_training_testing_step(
                        training, testing, missing_pct, noise_label_pct,
                        noise_label_type, noise_label_prob, nb_labels,
                        p_dimension, s_ncc, manager, epsilon_rejects,
                        ich_skep[disc][ks_ncc], cph_skep[disc][ks_ncc],
                        acc_prec[disc][ks_ncc], ich_reject[disc][ks_ncc],
                        cph_reject[disc][ks_ncc], data_continuous,
                        k_nearest_neighbors)
                    ich_skep[disc][ks_ncc], cph_skep[disc][ks_ncc] = rs[0], rs[1]
                    acc_prec[disc][ks_ncc] = rs[2]
                    ich_reject[disc][ks_ncc], cph_reject[disc][ks_ncc] = rs[3], rs[4]
                    logger.debug("Partial-s-k_step (acc, ich_skep) (%s, %s)",
                                 acc_prec[disc][ks_ncc], ich_skep[disc][ks_ncc])
                ich_skep[disc][ks_ncc] = ich_skep[disc][ks_ncc] / nb_kFold
                cph_skep[disc][ks_ncc] = cph_skep[disc][ks_ncc] / nb_kFold
                acc_prec[disc][ks_ncc] = acc_prec[disc][ks_ncc] / nb_kFold
                _partial_saving = [
                    str(nb_disc), s_ncc, time, ich_skep[disc][ks_ncc],
                    cph_skep[disc][ks_ncc], acc_prec[disc][ks_ncc]
                ]
                if epsilon_rejects is not None:
                    _reject_ich = [e / nb_kFold for e in ich_reject[disc][ks_ncc].values()]
                    _reject_cph = [e / nb_kFold for e in cph_reject[disc][ks_ncc].values()]
                    _partial_saving = _partial_saving + _reject_ich + _reject_cph
                else:
                    _reject_ich, _reject_cph = [], []
                logger.debug("Partial-s-k_step reject values (%s)",
                             ich_reject[disc][ks_ncc])
                writer.writerow(_partial_saving)
                file_csv.flush()
                logger.debug(
                    "Partial-s-k_step (disc, s, time, ich_skep, cph_skep, acc, ich_reject, cph_reject) "
                    "(%s, %s, %s, %s, %s, %s, %s, %s)", disc, s_ncc, time,
                    ich_skep[disc][ks_ncc], cph_skep[disc][ks_ncc],
                    acc_prec[disc][ks_ncc], _reject_ich, _reject_cph)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s, %s", ich_skep, cph_skep, acc_prec)
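# Hedged usage sketch of the KNN-NCC-BR experiment above; the dataset path and
# the neighborhood factor are hypothetical placeholders:
#
#   experiments_binr_vs_imprecise(in_path="datasets/emotions.arff",   # hypothetical
#                                 out_path="results_knn_nccbr.csv",   # must already exist
#                                 nb_process=4,
#                                 epsilon_rejects=[0.05, 0.10],
#                                 k_nearest_neighbors=1.5)            # hypothetical factor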
def __init__(self, DEBUG=False):
    super(BinaryILogisticLasso, self).__init__(DEBUG)
    self._logger = create_logger("BinaryILogistic", DEBUG)
    self._lasso_models = None
    self._precise_logit = None
    self._gammas = None
def __init__(self, DEBUG=False):
    super(MLChaining, self).__init__(DEBUG)
    self._logger = create_logger("MLChaining", DEBUG)
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  lib_path_server=None,
                                  model_type="ilda",
                                  from_ell=0.1,
                                  to_ell=1.0,
                                  by_ell=0.1,
                                  seed=None,
                                  cv_kfold_first=10,
                                  nb_process=2,
                                  skip_nfold=0,
                                  cv_kfold_second=10,
                                  seed_second=None,
                                  scaling=False):
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, out_path, model_type)
    logger.info('Parameters (ells, nbProcess, skip_nfold, cv_kfold_second) (%s, %s, %s, %s, %s, %s)',
                from_ell, to_ell, by_ell, nb_process, skip_nfold, cv_kfold_second)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    # Seeding a random value for the top-level k-fold learning-testing split
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("[FIRST-STEP-SEED] MODEL: %s, SEED: %s", model_type, seed)
    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type, lib_path_server)
    kfFirst = KFold(n_splits=cv_kfold_first, random_state=seed, shuffle=True)
    acc_u80, acc_u65, idx_kfold = dict(), dict(), 0
    seed_2step = generate_seeds(cv_kfold_second) if seed_second is None else seed_second
    logger.debug("[SECOND-STEP-SEEDS] MODEL: %s, SEED: %s, SECOND-SEED: %s",
                 model_type, seed, seed_2step)
    for idx_learning, idx_testing in kfFirst.split(y):
        ell_u65, ell_u80 = dict(), dict()
        # Generate the sampling k-fold (learning, testing) for the optimal ell parameters
        X_learning, y_learning = X[idx_learning], y[idx_learning]
        X_testing, y_testing = X[idx_testing], y[idx_testing]
        logger.info("Splits %s learning %s", idx_kfold, idx_learning)
        logger.info("Splits %s testing %s", idx_kfold, idx_testing)
        # n-Skipping folds and rebooting parameter from_ell to 0.01 for the next fold
        if skip_nfold != 0 and idx_kfold > skip_nfold:
            from_ell = 0.01
        # n-Skipping fold cross-validation (purpose: parallel computing)
        if idx_kfold >= skip_nfold:
            # Generate the same second-step k-fold (train, test) to compare all
            # ell parameters on identical splits
            splits_ell = list([])
            logger.debug("[2-STEP-SEED] MODEL: %s, SEED: %s OF FIRST STEP %s",
                         model_type, seed_2step[idx_kfold], seed)
            kfSecond = KFold(n_splits=cv_kfold_second,
                             random_state=seed_2step[idx_kfold],
                             shuffle=True)
            for idx_learn_train, idx_learn_test in kfSecond.split(y_learning):
                splits_ell.append((idx_learn_train, idx_learn_test))
                logger.info("Splits %s train %s", len(splits_ell), idx_learn_train)
                logger.info("Splits %s test %s", len(splits_ell), idx_learn_test)
            for ell_current in np.arange(from_ell, to_ell, by_ell):
                ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                logger.info("ELL_CURRENT %s", ell_current)
                for idx_learn_train, idx_learn_test in splits_ell:
                    logger.info("Splits step train %s", idx_learn_train)
                    logger.info("Splits step test %s", idx_learn_test)
                    X_cv_train, y_cv_train = X_learning[idx_learn_train], y_learning[idx_learn_train]
                    X_cv_test, y_cv_test = X_learning[idx_learn_test], y_learning[idx_learn_test]
                    ell_u65[ell_current], ell_u80[ell_current], _ = \
                        computing_training_testing_step(X_cv_train, y_cv_train,
                                                        X_cv_test, y_cv_test,
                                                        ell_current, manager,
                                                        ell_u65[ell_current],
                                                        ell_u80[ell_current])
                    logger.info("Partial-kfold (%s, %s, %s)", ell_current,
                                ell_u65[ell_current], ell_u80[ell_current])
                # average over the cv_kfold_second inner folds
                ell_u65[ell_current] = ell_u65[ell_current] / cv_kfold_second
                ell_u80[ell_current] = ell_u80[ell_current] / cv_kfold_second
                writer.writerow([ell_current, idx_kfold, ell_u65[ell_current],
                                 ell_u80[ell_current]])
                file_csv.flush()
                logger.debug("Partial-ell-k-step (%s, %s, %s)", idx_kfold,
                             ell_u65[ell_current], ell_u80[ell_current])
            logger.debug("Total-ell-k-step (%s, %s, %s, %s)", in_path,
                         idx_kfold, ell_u65, ell_u80)
            # Computing the optimal ells for the testing step
            acc_ell_u80 = max(ell_u80.values())
            acc_ell_u65 = max(ell_u65.values())
            ell_u80_opts = [k for k, v in ell_u80.items() if v == acc_ell_u80]
            ell_u65_opts = [k for k, v in ell_u65.items() if v == acc_ell_u65]
            acc_u65[idx_kfold], acc_u80[idx_kfold] = 0, 0
            n_ell80_opts, n_ell65_opts = len(ell_u80_opts), len(ell_u65_opts)
            for ell_u80_opt in ell_u80_opts:
                logger.info("ELL_OPTIMAL_CV_U80 %s", ell_u80_opt)
                _, _acc_u80, _ = \
                    computing_training_testing_step(X_learning, y_learning,
                                                    X_testing, y_testing,
                                                    ell_u80_opt, manager, 0, 0)
                acc_u80[idx_kfold] += _acc_u80
                writer.writerow([-999, -8, ell_u80_opt, _acc_u80])
            for ell_u65_opt in ell_u65_opts:
                logger.info("ELL_OPTIMAL_CV_U65 %s", ell_u65_opt)
                _acc_u65, _, _ = \
                    computing_training_testing_step(X_learning, y_learning,
                                                    X_testing, y_testing,
                                                    ell_u65_opt, manager, 0, 0)
                acc_u65[idx_kfold] += _acc_u65
                writer.writerow([-999, -7, ell_u65_opt, _acc_u65])
            acc_u65[idx_kfold] = acc_u65[idx_kfold] / n_ell65_opts
            acc_u80[idx_kfold] = acc_u80[idx_kfold] / n_ell80_opts
            writer.writerow([-999, idx_kfold, acc_u65[idx_kfold], acc_u80[idx_kfold]])
            file_csv.flush()
            logger.debug("Partial-ell-2step (u80, u65, accs) (%s, %s, %s, %s, %s)",
                         -999, ell_u80_opts, ell_u65_opts, acc_u65[idx_kfold],
                         acc_u80[idx_kfold])
        idx_kfold += 1
    writer.writerow([-9999, -9, np.mean(list(acc_u65.values())),
                     np.mean(list(acc_u80.values()))])
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path,
                 np.mean(list(acc_u65.values())),
                 np.mean(list(acc_u80.values())))
def performance_qda_regularized(in_path=None, out_path=None, cv_n_fold=10, seeds=None,
                                from_alpha=0, to_alpha=2.0, by_alpha=0.01, scaling=False):
    assert os.path.exists(in_path), "Without training data, no testing"
    assert os.path.exists(out_path), "File for saving performance does not exist"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_qda_regularized", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path, cv_n_fold, "qda")
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    alphas = np.arange(from_alpha, to_alpha, by_alpha)
    writer.writerow(alphas)
    qda_regularized = [None] * len(alphas)
    for idx, alpha in enumerate(alphas):
        qda_regularized[idx] = __factory_model_precise("qda", store_covariance=True, reg_param=alpha)

    # Generate a random k-fold validation (outer loop).
    kf_second = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    ikfold, accuracy, best_alphas = 0, [0] * cv_n_fold, [0] * cv_n_fold
    for idx_learning, idx_testing in kf_second.split(y):
        X_training, y_training = X[idx_learning], y[idx_learning]
        X_testing, y_testing = X[idx_testing], y[idx_testing]
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[ikfold], shuffle=True)
        acc_u80 = [0] * len(qda_regularized)
        for idx_train, idx_test in kf.split(y_training):
            X_cv_train, y_cv_train = X_training[idx_train], y_training[idx_train]
            X_cv_test, y_cv_test = X_training[idx_test], y_training[idx_test]
            for model in qda_regularized:
                model.fit(X_cv_train, y_cv_train)
            n_test = len(idx_test)
            for i, test in enumerate(X_cv_test):
                for im, model in enumerate(qda_regularized):
                    evaluate = model.predict([test])
                    if y_cv_test[i] in evaluate:
                        acc_u80[im] += (u80(evaluate) / n_test) / cv_n_fold
        idx_best = np.argmax(acc_u80)
        logger.info("[1kfold:best_model:seed:u80] (%s, %s, %s, %s)",
                    ikfold, alphas[idx_best], seeds[ikfold], acc_u80)
        writer.writerow(acc_u80)
        file_csv.flush()
        best_model = __factory_model_precise("qda", store_covariance=True, reg_param=alphas[idx_best])
        best_model.fit(X_training, y_training)
        accuracy[ikfold], bn_test, best_alphas[ikfold] = 0, len(idx_testing), alphas[idx_best]
        for i, test in enumerate(X_testing):
            evaluate = best_model.predict([test])
            if y_testing[i] in evaluate:
                accuracy[ikfold] += u80(evaluate) / bn_test
        logger.info("[2kfold:best_model:seed:accuracy] (%s, %s, %s)",
                    ikfold, alphas[idx_best], accuracy[ikfold])
        ikfold += 1
    file_csv.close()
    logger.info("[total:data-set:avgResults] (%s, %s, %s, %s)",
                in_path, np.mean(accuracy), best_alphas, accuracy)
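# Hedged usage sketch (not part of the original source): sweeping the QDA
# regularization parameter alpha over [0, 2) in 0.01 steps on a hypothetical
# dataset; out_path must exist beforehand since rows are appended to it.
def _example_run_qda_regularized():
    open("qda_alphas.csv", 'a').close()  # ensure the results file exists
    performance_qda_regularized(in_path="datasets/glass.csv",
                                out_path="qda_alphas.csv",
                                cv_n_fold=10,
                                from_alpha=0, to_alpha=2.0, by_alpha=0.01,
                                scaling=True)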
def experiments_binr_vs_imprecise(in_path=None, out_path=None, seed=None,
                                  missing_pct=0.0, noise_label_pct=0.0, noise_label_type=-1,
                                  noise_label_prob=0.5, nb_kFold=10, nb_process=1,
                                  scaling=False, epsilon_rejects=None,
                                  min_ell_param=0.5, max_ell_param=6.0, step_ell_param=1.0,
                                  remove_features=None, is_resampling=False):
    """
    Experiments with the imprecise binary-relevance classifier and missing/noisy data.

    :param in_path:
    :param out_path:
    :param seed:
    :param missing_pct: percentage of missing labels
    :param noise_label_pct: percentage of noisy labels
    :param noise_label_type: type of noise perturbation
    :param noise_label_prob: probability of noisy labels
    :param nb_kFold:
    :param nb_process: number of processes run in parallel
    :param scaling: scaling of the X input space (used for the knn-nccbr classifier)
    :param epsilon_rejects: epsilons of the reject option (for comparison with the imprecise version)
    :param min_ell_param: minimum value of the imprecise parameter s
    :param max_ell_param: maximum value of the imprecise parameter s
    :param step_ell_param: discretization step of the parameter s
    :param remove_features: features not to take into account
    :param is_resampling: whether test and training data sets were re-sampled beforehand

    .. note::
        TODO: Bug to fix when the missing percentage is high (90%).
    """
    if not is_resampling:
        assert os.path.exists(in_path), "Without training data, no testing"
    assert os.path.exists(out_path), "File for storing results does not exist"
    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info("(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
                min_ell_param, max_ell_param, step_ell_param)
    logger.info("(scaling, remove_features, process, epsilon_rejects) (%s, %s, %s, %s)",
                scaling, remove_features, nb_process, epsilon_rejects)
    logger.info("(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
                missing_pct, noise_label_pct, noise_label_type, noise_label_prob)

    # Seeding a random value for the top-level k-fold learning-testing split
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    # Create a CSV file for saving the query predictions of new instances
    out_path_partial = out_path[:-4] + "partial.csv"
    if not os.path.exists(out_path_partial):
        with open(out_path_partial, 'w'):
            pass
    fpartial_csv = open(out_path_partial, 'a')
    wpartial = csv.writer(fpartial_csv)
    save_query = save_partial_query_classification(fpartial_csv, wpartial)

    # instance of the classifier manager
    manager = ManagerWorkers(nb_process=nb_process, fun_prediction=skeptical_prediction)
    manager.executeAsync(class_model="classifip.models.mlc.igdabr.IGDA_BR")

    # c constants for the abstained multilabel metrics
    list_c_spe = [(num + 1) * .05 for num in range(10)]
    list_c_par = [(num + 1) * .1 for num in range(10)]

    # performance metrics
    metrics = MetricsPerformances(do_inference_exact=False,
                                  epsilon_rejects=epsilon_rejects,
                                  list_constants_spe=list_c_spe,
                                  list_constants_par=list_c_par)
    if not is_resampling:
        cv10x10fold_br_vs_ibr(logger, manager, metrics, remove_features, scaling, nb_kFold, seed,
                              writer, file_csv, min_ell_param, max_ell_param, step_ell_param,
                              missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    else:
        re_sampling_with_pct_train(logger, manager, metrics, nb_kFold, writer, file_csv,
                                   min_ell_param, max_ell_param, step_ell_param,
                                   missing_pct, noise_label_pct, noise_label_type,
                                   noise_label_prob, save_query)
    manager.poisonPillTraining()
    file_csv.close()
    fpartial_csv.close()
    logger.debug("Results Final: %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s",
                 metrics.ich_iid_skeptic, metrics.cph_iid_skeptic, metrics.score_hamming,
                 metrics.ich_spe_partial, metrics.cph_spe_partial,
                 metrics.ich_par_partial, metrics.cph_par_partial,
                 metrics.spe_partial_score, metrics.par_partial_score,
                 metrics.ich_reject, metrics.cph_reject)
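# Hedged usage sketch (not part of the original source): running the binary-relevance
# vs imprecise experiment with 10% missing labels and a reject-option baseline. The
# ARFF path and epsilon values are hypothetical illustrations; out_path must exist
# beforehand because results are appended.
def _example_run_binr_vs_imprecise():
    open("results_ibr.csv", 'a').close()
    experiments_binr_vs_imprecise(in_path="datasets/emotions.arff",
                                  out_path="results_ibr.csv",
                                  missing_pct=0.10, nb_process=2,
                                  epsilon_rejects=[0.05, 0.15, 0.25],
                                  min_ell_param=0.5, max_ell_param=6.0,
                                  step_ell_param=1.0)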
import numpy as np, random, os, time, sys
import multiprocessing
from functools import partial
from classifip.utils import create_logger
from CSP_common import *
from classifip.models.ncclr import NCCLR
from classifip.evaluation.measures import correctness_measure, completeness_measure
from classifip.dataset.arff import ArffFile
from classifip.evaluation import k_fold_cross_validation

logger = create_logger("computing_best_min_s_cross_validation", True)


def parallel_prediction_csp(model, test_data, dataset, evaluatePBOX):
    idx, pBox = evaluatePBOX
    predicts = model.inference_CSP([pBox])
    y_ground_truth = test_data[idx][-1].split(">")
    correctness = correctness_measure(y_ground_truth, predicts[0])
    completeness = completeness_measure(y_ground_truth, predicts[0])
    is_coherent = False  # verify whether the prediction is coherent
    pid = multiprocessing.current_process().name

    def _pinfo(message, kwargs):
        print("[" + pid + "][" + time.strftime('%x %X %Z') + "]", "-", message % kwargs, flush=True)

    if predicts[0] is not None:
        is_coherent = True
    # The original source is truncated here; the lines below are a minimal,
    # assumed completion that logs and returns the computed measures so the
    # function is usable, not the original code.
    _pinfo("(idx, correctness, completeness, coherent) (%s, %s, %s, %s)",
           (idx, correctness, completeness, is_coherent))
    return idx, correctness, completeness, is_coherent
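# Hedged usage sketch (not part of the original source): parallel_prediction_csp
# is shaped to be mapped over enumerated p-boxes, mirroring the POOL.map/partial
# pattern used elsewhere in this code base. model, test_data and pboxes are
# assumed to come from an NCCLR training/evaluation step.
def _example_parallel_csp(model, test_data, dataset, pboxes, nb_process=2):
    target = partial(parallel_prediction_csp, model, test_data, dataset)
    with multiprocessing.Pool(processes=nb_process) as pool:
        # each worker receives one (idx, pBox) pair from enumerate(pboxes)
        return pool.map(target, enumerate(pboxes))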
def experiments_chaining_imprecise(in_path=None, out_path=None, seed=None, nb_kFold=10,
                                   nb_process=1, min_ncc_s_param=0.5, max_ncc_s_param=6.0,
                                   step_ncc_s_param=1.0, missing_pct=0.0, noise_label_pct=0.0,
                                   noise_label_type=-1, noise_label_prob=0.5,
                                   remove_features=None, scaling=False,
                                   strategy_chaining=IMLCStrategy.IMPRECISE_BRANCHING,
                                   safety_chaining=False):
    assert os.path.exists(in_path), "Without training data, no testing"
    assert os.path.exists(out_path), "File for storing results does not exist"
    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info("(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
                min_ncc_s_param, max_ncc_s_param, step_ncc_s_param)
    logger.info("(scaling, remove_features, process) (%s, %s, %s)",
                scaling, remove_features, nb_process)
    logger.info("(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
                missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    logger.info("(strategy_chaining, safety_chaining) (%s, %s)",
                strategy_chaining, safety_chaining)

    # Seeding a random value for the top-level k-fold learning-testing split
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(class_model="classifip.models.mlc.chainncc.MLChaining")
    ich, cph, acc, acc_trans, avg_sols = dict(), dict(), dict(), dict(), dict()
    min_discretize, max_discretize = 5, 7
    for nb_disc in range(min_discretize, max_discretize):
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                try:
                    data_learning.remove_col(r_feature)
                except Exception as err:
                    print("Remove feature error: {0}".format(err))
        nb_labels = get_nb_labels_class(data_learning)
        if scaling:
            normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)
        for time in range(nb_kFold):  # 10x10 cross-validation
            logger.info("Number of intervals for discretization and labels (%1d, %1d).",
                        nb_disc, nb_labels)
            cv_kfold = k_fold_cross_validation(data_learning, nb_kFold,
                                               randomise=True, random_seed=seed[time])
            splits_s = list([])
            for training, testing in cv_kfold:
                train_clone_data = training.make_clone()
                test_clone_data = testing.make_clone()
                MLCNCC.shuffle_labels_train_testing(train_clone_data, test_clone_data,
                                                    nb_labels=nb_labels)
                logger.info("Splits %s train %s", len(training.data), training.data[0])
                logger.info("Splits %s test %s", len(testing.data), testing.data[0])
                splits_s.append((train_clone_data, test_clone_data))

            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            acc_trans[disc], acc[disc] = dict(), dict()
            avg_sols[disc] = dict()
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = 0, 0
                avg_sols[disc][ks_ncc] = 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    res = computing_training_testing_step(
                        training, testing, nb_labels, s_ncc, manager,
                        strategy_chaining, safety_chaining,
                        missing_pct, noise_label_pct, noise_label_type, noise_label_prob,
                        ich[disc][ks_ncc], cph[disc][ks_ncc], acc[disc][ks_ncc],
                        acc_trans[disc][ks_ncc], avg_sols[disc][ks_ncc])
                    ich[disc][ks_ncc], cph[disc][ks_ncc] = res[0], res[1]
                    acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = res[2], res[3]
                    avg_sols[disc][ks_ncc] = res[4]
                    logger.debug("Partial-step-cumulative (acc, ich, acc_trans, avg_sols) "
                                 "(%s, %s, %s, %s)",
                                 acc[disc][ks_ncc], ich[disc][ks_ncc],
                                 acc_trans[disc][ks_ncc], avg_sols[disc][ks_ncc])
                writer.writerow([str(nb_disc), s_ncc, time,
                                 ich[disc][ks_ncc] / nb_kFold, cph[disc][ks_ncc] / nb_kFold,
                                 acc[disc][ks_ncc] / nb_kFold, acc_trans[disc][ks_ncc] / nb_kFold,
                                 avg_sols[disc][ks_ncc] / nb_kFold])
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s, %s)",
                             disc, s_ncc, time,
                             ich[disc][ks_ncc] / nb_kFold, cph[disc][ks_ncc] / nb_kFold,
                             acc_trans[disc][ks_ncc] / nb_kFold)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s", ich, cph)
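# Hedged usage sketch (not part of the original source): running the chaining
# experiment with the imprecise-branching strategy over s in [0.5, 6.0) by steps
# of 1.0. The ARFF path is a hypothetical placeholder; out_path must exist
# beforehand because results are appended.
def _example_run_chaining_imprecise():
    open("results_chaining.csv", 'a').close()
    experiments_chaining_imprecise(in_path="datasets/emotions.arff",
                                   out_path="results_chaining.csv",
                                   nb_process=2,
                                   strategy_chaining=IMLCStrategy.IMPRECISE_BRANCHING,
                                   safety_chaining=False)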
def __init__(self, DEBUG=False):
    super(MLCNCCExact, self).__init__(DEBUG)
    self.power_set = []
    self.root = None
    self.DEBUG = DEBUG
    self._logger = create_logger("MLCNCCExact", DEBUG)