Example #1
def computing_precise_vs_imprecise(in_path=None,
                                   ell_optimal=0.1,
                                   cv_n_fold=10,
                                   seeds=None,
                                   lib_path_server=None,
                                   model_type_precise='lda',
                                   model_type_imprecise='ilda',
                                   scaling=True):
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(
        in_path)
    logger = create_logger("computing_precise_vs_imprecise", True)
    logger.info('Training dataset and models (%s, %s, %s, %s)', in_path,
                model_type_precise, model_type_imprecise, ell_optimal)
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    model_impr = __factory_model(model_type_imprecise,
                                 init_matlab=True,
                                 add_path_matlab=lib_path_server,
                                 DEBUG=False)
    model_prec = __factory_model_precise(model_type_precise,
                                         store_covariance=True)
    avg_imprecise, avg_precise, n_real_times = 0, 0, 0
    for time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        imprecise_mean, precise_mean, n_real_fold = 0, 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model_impr.learn(X=X_cv_train, y=y_cv_train, ell=ell_optimal)
            model_prec.fit(X_cv_train, y_cv_train)
            n_real_tests, time_precise, time_imprecise = 0, 0, 0
            n_test, _ = X_cv_test.shape
            for i, test in enumerate(X_cv_test):
                evaluate_imp, _ = model_impr.evaluate(test)
                evaluate = model_prec.predict([test])
                if len(evaluate_imp) > 1:
                    n_real_tests += 1
                    if y_cv_test[i] in evaluate_imp: time_imprecise += 1
                    if y_cv_test[i] in evaluate: time_precise += 1
                logger.debug(
                    "(time, iTest, ellOptimal, cautious, prediction, ground-truth)(%s, %s, %s, %s, %s, %s)",
                    time, i, ell_optimal, evaluate_imp, evaluate, y_cv_test[i])
            logger.debug(
                "(time, ellOptimal, nRealTests, timeImprecise, timePrecise) (%s, %s, %s, %s, %s)",
                time, ell_optimal, n_real_tests, time_imprecise, time_precise)
            if n_real_tests > 0:
                n_real_fold += 1
                imprecise_mean += time_imprecise / n_real_tests
                precise_mean += time_precise / n_real_tests
        logger.debug("(time, nRealFold, imprecise, precise) (%s, %s, %s, %s)",
                     time, n_real_fold, imprecise_mean, precise_mean)
        if n_real_fold > 0:
            n_real_times += 1
            avg_imprecise += imprecise_mean / n_real_fold
            avg_precise += precise_mean / n_real_fold
    logger.debug("(dataset, models, imprec, prec) (%s, %s, %s, %s, %s)",
                 in_path, model_type_imprecise, model_type_precise,
                 avg_imprecise / n_real_times, avg_precise / n_real_times)
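The helpers generate_seeds and normalize_minmax used above come from the library's utilities; a minimal sketch consistent with how the examples call them (hypothetical implementations, mirroring the random.randrange(pow(2, 30)) pattern of Example #5):

import random
import numpy as np

def generate_seeds(n):
    # Hypothetical sketch: n independent integer seeds.
    return [random.randrange(pow(2, 30)) for _ in range(n)]

def normalize_minmax(X):
    # Hypothetical sketch: column-wise min-max scaling of a 2-D array,
    # guarding against constant columns to avoid division by zero.
    X = np.asarray(X, dtype=float)
    mins, maxs = X.min(axis=0), X.max(axis=0)
    return (X - mins) / np.where(maxs > mins, maxs - mins, 1.0)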
Example #2
 def __init__(self,
              solver_matlab=False,
              gda_method="nda",
              add_path_matlab=None,
              DEBUG=False):
     """
     :param solver_matlab: If it is
         true: it create a only classifier to handle m-binary classifier (exact solver matlab)
         false: it create a classifier by binary classifier (approximation solver python)
     :param gda_method: inda, ieda, ilda, iqda
     :param add_path_matlab:
     :param DEBUG:
     """
     super(IGDA_BR, self).__init__(DEBUG)
     self.gda_models = None
     self.nb_feature = None
     self.__solver_matlab = solver_matlab
     self.__igda_name = "i" + gda_method
     self.__gda_name = gda_method
     self._logger = create_logger("IGDA_BR", DEBUG)
     if self.__solver_matlab:
         self._global_gda_imprecise = _factory_igda_model(
             model_type=self.__igda_name,
             solver_matlab=True,
             add_path_matlab=add_path_matlab,
             DEBUG=DEBUG)
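With solver_matlab=False the constructor above skips the MATLAB factory entirely, so a hedged usage sketch needs nothing beyond this class and create_logger:

clf = IGDA_BR(solver_matlab=False, gda_method="lda", DEBUG=True)
# The global imprecise model is only built when solver_matlab=True; otherwise
# the classifier is expected to work per binary sub-problem (approximate solver).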
Example #3
def computing_outer_vs_exact_inference_random_tree(out_path,
                                                   nb_labels=3,
                                                   nb_repeats=100,
                                                   nb_process=1,
                                                   seed=None,
                                                   min_epsilon_param=0.05,
                                                   max_epsilon_param=0.5,
                                                   step_epsilon_param=0.05):
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_outer_vs_exact_inference_random_tree",
                           True)
    logger.info('Results file (%s)', out_path)
    logger.info("(nb_repeats, nb_process, nb_labels) (%s, %s, %s)", nb_repeats,
                nb_process, nb_labels)
    logger.info(
        "(min_epsilon_param, max_epsilon_param, step_epsilon_param) (%s, %s, %s)",
        min_epsilon_param, max_epsilon_param, step_epsilon_param)
    if seed is None:
        seed = random.randrange(pow(2, 20))
    random.seed(seed)
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    # Create a CSV file for saving time prediction
    out_path_partial = out_path[:-4] + "_time.csv"
    if not os.path.exists(out_path_partial):
        with open(out_path_partial, 'w'):
            pass
    f_time_csv = open(out_path_partial, 'a')
    writer_time = csv.writer(f_time_csv)

    POOL = multiprocessing.Pool(processes=nb_process)
    for epsilon in np.arange(min_epsilon_param, max_epsilon_param,
                             step_epsilon_param):
        target_function = partial(parallel_inferences,
                                  nb_labels=nb_labels,
                                  epsilon=epsilon)
        set_distance_cardinal = POOL.map(target_function, range(nb_repeats))
        set_distance_cardinal = np.array(set_distance_cardinal)
        # writing distance outer vs exact procedure
        writer.writerow(np.hstack((epsilon, set_distance_cardinal[:, 0])))
        file_csv.flush()
        logger.info("Partial-s-k_step (%s, %s)", str(epsilon),
                    sum(set_distance_cardinal[:, 0]) / nb_repeats)
        # writing time naive vs exact procedure
        writer_time.writerow(
            np.hstack((epsilon, "exact", set_distance_cardinal[:, 1])))
        writer_time.writerow(
            np.hstack((epsilon, "naive", set_distance_cardinal[:, 2])))
        f_time_csv.flush()
        logger.info("Partial-avg-time (%s, %s)", str(epsilon),
                    np.mean(set_distance_cardinal[:, 1:3], axis=0))

    POOL.close()
    POOL.join()
    file_csv.close()
    f_time_csv.close()
    logger.info("Results Final")
Example #4
def performance_accuracy_noise_corrupted_test_data(in_train_paths=None,
                                                   in_tests_paths=None,
                                                   model_type_precise='lda',
                                                   model_type_imprecise='ilda',
                                                   ell_optimal=0.1,
                                                   scaling=False,
                                                   lib_path_server=None,
                                                   nb_process=10):
    assert isinstance(in_train_paths,
                      list), "Without training data, cannot create the model"
    assert isinstance(
        in_tests_paths,
        list), "Without test data, cannot compute accuracy"

    logger = create_logger("performance_accuracy_noise_corrupted_test_data",
                           True)
    logger.info('Training dataset (%s, %s, %s)', in_train_paths,
                model_type_imprecise, ell_optimal)

    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type_imprecise, lib_path_server)
    versus = model_type_imprecise + "_vs_" + model_type_precise
    file_csv = open("results_" + versus + "_noise_accuracy.csv", 'w')
    writer = csv.writer(file_csv)
    model_precise = __factory_model_precise(model_type_precise,
                                            store_covariance=True)
    for in_train_path in in_train_paths:
        X_train, y_train = dataset_to_Xy(in_train_path, scaling=scaling)
        model_precise.fit(X_train, y_train)
        accuracies = dict({})
        for in_test_path in in_tests_paths:
            X_test, y_test = dataset_to_Xy(in_test_path, scaling=scaling)
            _u65, _u80, _set = computing_training_testing_step(
                X_train, y_train, X_test, y_test, ell_optimal, manager, 0, 0,
                0)
            evaluate = model_precise.predict(X_test)
            _acc = sum(
                1 for k, j in zip(evaluate, y_test) if k == j) / len(y_test)
            logger.debug("accuracy-in_test_path (%s, %s, %s, %s, %s, %s)",
                         ntpath.basename(in_train_path),
                         ntpath.basename(in_test_path), ell_optimal, _u65,
                         _u80, _acc)
            accuracies[ntpath.basename(in_test_path)] = [
                ell_optimal, _u65, _u80, _set, _acc
            ]
            writer.writerow([
                ntpath.basename(in_train_path),
                ntpath.basename(in_test_path), ell_optimal, _u65, _u80, _set,
                _acc
            ])
            file_csv.flush()
        logger.debug("Partial-finish-accuracy-noise-corrupted_test %s: %s",
                     ntpath.basename(in_train_path), accuracies)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Finish-accuracy-noise-corrupted_test")
Example #5
def computing_best_imprecise_mean(in_path=None, out_path=None, cv_nfold=10, model_type="ieda", test_size=0.4,
                                  from_ell=0.1, to_ell=1.0, by_ell=0.1, seed=None, lib_path_server=None, scaling=False):
    assert os.path.exists(in_path), "Without training data, cannot test"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset %s', in_path)
    data = pd.read_csv(in_path)  # , header=None)
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    ell_u65, ell_u80 = dict(), dict()
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("MODEL: %s, SEED: %s", model_type, seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
    kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
    splits = list([])
    for idx_train, idx_test in kf.split(y_train):
        splits.append((idx_train, idx_test))
        logger.info("Splits %s train %s", len(splits), idx_train)
        logger.info("Splits %s test %s", len(splits), idx_test)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    model = __factory_model(model_type, solver_matlab=True, add_path_matlab=lib_path_server, DEBUG=True)
    for ell_current in np.arange(from_ell, to_ell, by_ell):
        ell_u65[ell_current], ell_u80[ell_current] = 0, 0
        logger.info("ELL_CURRENT %s", ell_current)
        for idx_train, idx_test in splits:
            logger.info("Splits train %s", idx_train)
            logger.info("Splits test %s", idx_test)
            X_cv_train, y_cv_train = X_train[idx_train], y_train[idx_train]
            X_cv_test, y_cv_test = X_train[idx_test], y_train[idx_test]
            model.learn(X=X_cv_train, y=y_cv_train, ell=ell_current)
            sum_u65, sum_u80 = 0, 0
            n_test = len(idx_test)
            for i, test in enumerate(X_cv_test):
                evaluate = model.evaluate(test)
                logger.debug("(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                             i, ell_current, evaluate, y_cv_test[i])
                if y_cv_test[i] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            ell_u65[ell_current] += sum_u65 / n_test
            ell_u80[ell_current] += sum_u80 / n_test
            logger.debug("Partial-kfold (%s, %s, %s)", ell_current, ell_u65[ell_current], ell_u80[ell_current])
        ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
        ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
        writer.writerow([ell_current, ell_u65[ell_current], ell_u80[ell_current]])
        file_csv.flush()
        logger.debug("Partial-ell (%s, %s, %s)", ell_current, ell_u65, ell_u80)
    file_csv.close()
    logger.debug("Total-ell %s %s %s", in_path, ell_u65, ell_u80)
Example #6
 def __init__(self, DEBUG=False):
     self.feature_names = []
     self.label_names = []
     self.feature_values = dict()
     self.feature_count = dict()
     self.label_counts = []
     self.nb_labels = 0
     self.training_size = 0
     self.marginal_props = None
     self.DEBUG = DEBUG
     self.has_imprecise_marginal = False
     self._logger = create_logger("MLCNCC", DEBUG)
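A plausible minimal sketch of the create_logger helper used throughout these examples (hypothetical implementation built on the standard logging module):

import logging
import sys

def create_logger(name="default", DEBUG=False):
    # Hypothetical sketch: a console logger whose level follows the DEBUG flag.
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
    if not logger.handlers:  # avoid stacking handlers on repeated calls
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
    return logger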
Example #7
def performance_hold_out(in_path=None,
                         out_path=None,
                         model_type='lda',
                         test_pct=0.4,
                         n_times=10,
                         seeds=None,
                         scaling=False):
    assert os.path.exists(in_path), "Without training data, cannot test"
    assert os.path.exists(out_path), "Output file for saving performance does not exist"

    logger = create_logger("performance_hold_out", True)
    logger.info('Training data set %s, test percentage %s, model_type %s',
                in_path, test_pct, model_type)

    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = data.iloc[:, -1].tolist()

    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    file_csv = open(out_path, 'w')
    writer = csv.writer(file_csv)

    model = __factory_model_precise(model_type, store_covariance=True)
    mean_u65, mean_u80 = np.array([]), np.array([])
    for i in range(0, n_times):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_pct, random_state=seeds[i])
        sum_u65, sum_u80 = 0, 0
        model.fit(X_train, y_train)
        n, _ = X_test.shape
        for j, test in enumerate(X_test):
            evaluate = model.predict([test])
            if y_test[j] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        logger.info("time, u65, u80 (%s, %s, %s)", i, sum_u65 / n, sum_u80 / n)
        mean_u65 = np.append(mean_u65, sum_u65 / n)
        mean_u80 = np.append(mean_u80, sum_u80 / n)
        writer.writerow([-999, i, mean_u65[i], mean_u80[i]])
        file_csv.flush()
    file_csv.close()
    logger.info("[total:data-set:avgResults] (%s, %s)", np.mean(mean_u65),
                np.mean(mean_u80))
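A usage sketch with hypothetical paths; note that the assert requires out_path to exist beforehand even though the file is then opened in 'w' mode:

open('results_hold_out.csv', 'a').close()       # satisfy the exists-assert
performance_hold_out(in_path='data/iris.csv',   # hypothetical input path
                     out_path='results_hold_out.csv',
                     model_type='lda',
                     test_pct=0.4,
                     n_times=10)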
Example #8
def performance_accuracy_hold_out(in_path=None,
                                  model_type="ilda",
                                  ell_optimal=0.1,
                                  lib_path_server=None,
                                  seeds=None,
                                  DEBUG=False,
                                  scaling=False):
    assert os.path.exists(
        in_path
    ), "Without training data, cannot perform hold-out accuracy"
    logger = create_logger("performance_accuracy_hold_out", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)

    seeds = generate_seeds(10) if seeds is None else seeds  # 10 hold-out repetitions by default
    logger.info('Seeds used for accuracy %s', seeds)
    n_time = len(seeds)
    mean_u65, mean_u80 = 0, 0
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=DEBUG)
    for k in range(0, n_time):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seeds[k])
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = model.evaluate(test)
            logger.debug(
                "(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                i, ell_optimal, evaluate, y_test[i])
            if y_test[i] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_current, k,
                     sum_u65 / n_test, sum_u80 / n_test)
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / n_time
    mean_u80 = mean_u80 / n_time
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal, mean_u65,
                 mean_u80)
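Since n_time = len(seeds), passing an explicit seeds list fixes the number of hold-out repetitions; a sketch with hypothetical values (the MATLAB solver must be reachable, as the factory is called with solver_matlab=True):

performance_accuracy_hold_out(in_path='data/iris.csv',     # hypothetical path
                              model_type='ilda',
                              ell_optimal=0.1,
                              seeds=[11, 23, 42, 77, 91])  # 5 repetitions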
Example #9
def performance_cv_accuracy(in_path=None,
                            model_type='lda',
                            cv_n_fold=10,
                            seeds=None,
                            scaling=False):
    assert os.path.exists(in_path), "Without training data, cannot test"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_cv_accuracy", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, model_type)
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)
    for time in range(cv_n_fold):
        # Generate a random k-fold validation split.
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        model = __factory_model_precise(model_type, store_covariance=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model.fit(X_cv_train, y_cv_train)
            n_test = len(idx_test)
            sum_u65, sum_u80 = 0, 0
            for i, test in enumerate(X_cv_test):
                evaluate = model.predict([test])
                logger.debug(
                    "(testing, prediction, ground-truth) (%s, %s, %s)", i,
                    evaluate, y_cv_test[i])
                if y_cv_test[i] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            mean_u65 += sum_u65 / n_test
            mean_u80 += sum_u80 / n_test
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", time, seeds[time],
                    mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    logger.info("[Total:data-set:avgResults] (%s, %s,  %s)", in_path,
                avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
Example #10
def performance_cv_accuracy_imprecise(in_path=None,
                                      model_type="ilda",
                                      ell_optimal=0.1,
                                      nb_process=2,
                                      lib_path_server=None,
                                      cv_n_fold=10,
                                      seeds=None,
                                      criterion="maximality"):
    assert os.path.exists(in_path), "Without training data, cannot test"
    data = pd.read_csv(in_path)
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s, %s)', in_path, model_type,
                ell_optimal, criterion)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    for time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            logger.info("Splits train %s", idx_train)
            logger.info("Splits test %s", idx_test)
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            mean_u65, mean_u80 = computing_training_testing_step(
                X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_optimal,
                manager, mean_u65, mean_u80)
            logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, time,
                         mean_u65, mean_u80)
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", time, seeds[time],
                    mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    manager.poisonPillTraining()
    logger.debug("total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
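A usage sketch for this worker-based variant, with hypothetical values; the criterion parameter selects the imprecise decision rule handed to the workers:

performance_cv_accuracy_imprecise(in_path='data/iris.csv',  # hypothetical path
                                  model_type='ilda',
                                  ell_optimal=0.1,
                                  nb_process=2,
                                  criterion="maximality")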
Example #11
def computing_outer_vs_exact_ranking_random_tree(out_path,
                                                 nb_labels=3,
                                                 nb_repeats=100,
                                                 nb_process=1,
                                                 seed=None,
                                                 min_epsilon_param=0.05,
                                                 max_epsilon_param=0.50,
                                                 step_epsilon_param=0.05):
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_outer_vs_exact_inference_random_tree",
                           True)
    logger.info('Results file (%s)', out_path)
    logger.info("(nb_repeats, nb_process, nb_labels) (%s, %s, %s)", nb_repeats,
                nb_process, nb_labels)
    logger.info(
        "(min_epsilon_param, max_epsilon_param, step_epsilon_param) (%s, %s, %s)",
        min_epsilon_param, max_epsilon_param, step_epsilon_param)
    if seed is None:
        seed = random.randrange(pow(2, 20))
    random.seed(seed)
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    POOL = multiprocessing.Pool(processes=nb_process)
    for epsilon in np.arange(min_epsilon_param, max_epsilon_param,
                             step_epsilon_param):
        target_function = partial(parallel_inferences,
                                  nb_labels=nb_labels,
                                  epsilon=epsilon)
        set_distance_cardinal = POOL.map(target_function, range(nb_repeats))
        writer.writerow(np.hstack((epsilon, set_distance_cardinal)))
        file_csv.flush()
        logger.info("Partial-s-k_step (%s, %s)", str(epsilon),
                    sum(set_distance_cardinal) / nb_repeats)
    POOL.close()
    POOL.join()
    file_csv.close()
    logger.info("Results Final")
Example #12
def computing_time_prediction(in_path=None,
                              ell_optimal=0.1,
                              lib_path_server=None,
                              model_type="ilda",
                              criterion="maximality",
                              k_repetition=10,
                              seeds=None):
    assert os.path.exists(in_path), "Without training data, cannot test"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("computing_time_prediction", True)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    seeds = generate_seeds(k_repetition) if seeds is None else seeds
    logger.info(
        'Training dataset %s with maximality version (%s) and model (%s), ell_optimal (%s) and seeds %s',
        in_path, criterion, model_type, ell_optimal, seeds)
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=False)
    avg = np.array([])
    for k in range(k_repetition):
        logger.info("%s-fold repetition randomly, seed %s", k, seeds[k])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seeds[k])
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        n, _ = X_test.shape
        sum_time = 0
        for i, test in enumerate(X_test):
            start = time.time()
            evaluate = model.evaluate(test, criterion=criterion)
            end = time.time()
            logger.info("Evaluate %s, Ground-truth %s, Time %s ", evaluate,
                        y_test[i], (end - start))
            sum_time += (end - start)
        avg = np.append(avg, sum_time / n)
    logger.info("Total time (%s, %s) and average %s and sd %s of %s testing",
                in_path, avg, np.mean(avg), np.std(avg), n)
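A usage sketch with hypothetical values; the function times model.evaluate on each test instance and reports per-repetition averages (the MATLAB solver is required, since the factory is called with solver_matlab=True):

computing_time_prediction(in_path='data/iris.csv',  # hypothetical path
                          ell_optimal=0.1,
                          model_type='ilda',
                          criterion='maximality',
                          k_repetition=10)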
Example #13
def performance_cv_accuracy_imprecise(in_path=None,
                                      model_type="ilda",
                                      ell_optimal=0.1,
                                      scaling=False,
                                      lib_path_server=None,
                                      cv_n_fold=10,
                                      seeds=None,
                                      nb_process=10):
    assert os.path.exists(
        in_path
    ), "Without training data, cannot perform cross-validation accuracy"
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)

    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)

    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type, lib_path_server)
    for time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[time], shuffle=True)
        mean_u65, mean_u80 = 0, 0
        for idx_train, idx_test in kf.split(y):
            mean_u65, mean_u80, _ = computing_training_testing_step(
                X[idx_train], y[idx_train], X[idx_test], y[idx_test],
                ell_optimal, manager, mean_u65, mean_u80)
            logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, time,
                         mean_u65, mean_u80)
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", time, seeds[time],
                    mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
        avg_u65 += mean_u65 / cv_n_fold
        avg_u80 += mean_u80 / cv_n_fold
    manager.poisonPillTraining()
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
Example #14
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=True,
                                  max_ncc_s_param=5,
                                  remove_features=None):
    assert os.path.exists(in_path), "Without training data, cannot test"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)

    # Seed random values for the top-level k-fold learning-testing splits
    if seed is not None: random.seed(seed)
    seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(class_model="classifip.models.mlcncc.MLCNCC")

    ich, cph = dict(), dict()
    min_discretize, max_discretize = 5, 9
    for nb_disc in range(min_discretize, max_discretize):
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                data_learning.remove_col(r_feature)
        nb_labels = get_nb_labels_class(data_learning)
        if scaling: normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10 times 10-fold cross-validation
            logger.info(
                "Number of discretization intervals and labels (%1d, %1d)." %
                (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])

            splits_s = list([])
            for training, testing in cv_kfold:
                splits_s.append((training, testing))
                logger.info("Splits %s train %s", len(training.data),
                            training.data[0])
                logger.info("Splits %s test %s", len(testing.data),
                            testing.data[0])

            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            for s_ncc in np.arange(0.1, max_ncc_s_param + 1, 1):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    ich[disc][ks_ncc], cph[disc][
                        ks_ncc] = computing_training_testing_step(
                            training, testing, nb_labels, s_ncc, manager,
                            ich[disc][ks_ncc], cph[disc][ks_ncc])

                writer.writerow([
                    str(nb_disc), s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                    cph[disc][ks_ncc] / nb_kFold
                ])
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s)", disc,
                             s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s", ich, cph)
Example #15
class MLCNCC(metaclass=abc.ABCMeta):
    # global static variables
    LABEL_PARTIAL_VALUE = -1
    logger_global = create_logger('MLCNCC_GLOBAL', True)
    """
        NCCBR implements the naive credal classification method using the IDM for
            multilabel classification with binary relevance.
            
        Base classifier NCC based on [#zaffalon2002]_ and on the improvement 
        proposed by [#corani2010]_

        :param feature_count: store counts of couples label/feature
        :type feature_count: dictionnary with keys label/feature
        :param label_counts: store counts of class labels (to instanciate prior)
        :type label_counts: list
        :param feature_names: store the names of features
        :type feature_names: list
        :param feature_values: store modalities of features
        :type feature_values: dictionary associating each feature name to a list

    """
    def __init__(self, DEBUG=False):
        self.feature_names = []
        self.label_names = []
        self.feature_values = dict()
        self.feature_count = dict()
        self.label_counts = []
        self.nb_labels = 0
        self.training_size = 0
        self.marginal_props = None
        self.DEBUG = DEBUG
        self.has_imprecise_marginal = False
        self._logger = create_logger("MLCNCC", DEBUG)

    def learn(self, learn_data_set, nb_labels):
        """learn the NCC for each label, mainly storing counts of feature/label pairs

        :param learn_data_set: learning instances
        :type learn_data_set: :class:`~classifip.dataset.arff.ArffFile`
        :param nb_labels: number of labels
        :type nb_labels: integer
        """
        self.__init__(DEBUG=self.DEBUG)  # reset all counts, keeping the DEBUG flag
        self.nb_labels = nb_labels

        # Initializing the counts
        self.feature_names = learn_data_set.attributes[:-self.nb_labels]
        self.label_names = np.array(
            learn_data_set.attributes[-self.nb_labels:])

        self.feature_values = learn_data_set.attribute_data.copy()
        # computing precise marginal P(Y) count
        self.marginal_props = dict({i: dict() for i in range(self.nb_labels)})

        for label_index, label_value in enumerate(self.label_names):
            # recover the counts of classes 1 and 0
            label_set_one = learn_data_set.select_col_vals(label_value, ['1'])
            label_set_zero = learn_data_set.select_col_vals(label_value, ['0'])
            nb_count_one, nb_count_zero = len(label_set_one.data), len(
                label_set_zero.data)
            # if we work with missing labels (label=-1: missing), the marginal values change
            # (1) Computing label proportions
            self.marginal_props[label_index][0] = nb_count_zero
            self.marginal_props[label_index][1] = nb_count_one
            self.marginal_props[label_index]['all'] = float(nb_count_one +
                                                            nb_count_zero)
            # (2) Computing counting label|attributes
            for feature in self.feature_names:
                count_vector_one, count_vector_zero = [], []
                feature_index = learn_data_set.attributes.index(feature)
                for feature_value in learn_data_set.attribute_data[feature]:
                    nb_items_one = [
                        row[feature_index] for row in label_set_one.data
                    ].count(feature_value)
                    count_vector_one.append(nb_items_one)
                    nb_items_zero = [
                        row[feature_index] for row in label_set_zero.data
                    ].count(feature_value)
                    count_vector_zero.append(nb_items_zero)
                self.feature_count[label_value + '|in|' +
                                   feature] = count_vector_one
                self.feature_count[label_value + '|out|' +
                                   feature] = count_vector_zero
            # (3) Computing counting label|other_labels
            for label_feature in self.label_names:
                if label_feature != label_value:
                    label_feature_index = learn_data_set.attributes.index(
                        label_feature)
                    count_vector_one, count_vector_zero = [], []
                    for label_feature_value in learn_data_set.attribute_data[
                            label_feature]:
                        nb_items_one = [
                            row[label_feature_index]
                            for row in label_set_one.data
                        ].count(label_feature_value)
                        count_vector_one.append(nb_items_one)
                        nb_items_zero = [
                            row[label_feature_index]
                            for row in label_set_zero.data
                        ].count(label_feature_value)
                        count_vector_zero.append(nb_items_zero)
                    self.feature_count[label_value + '|in|' +
                                       label_feature] = count_vector_one
                    self.feature_count[label_value + '|out|' +
                                       label_feature] = count_vector_zero

    @abc.abstractmethod
    def evaluate(self,
                 test_dataset,
                 ncc_epsilon=0.001,
                 ncc_s_param=2.0,
                 with_imprecise_marginal=False,
                 precision=None):
        pass

    @staticmethod
    def __random_set_labels_index(dataset, nb_labels, seed_random_label=None):
        """
        :param dataset:
        :param seed_random_label:
        :return:
        """
        # Generate random positions for the label chain
        if seed_random_label is None:
            seed_random_label = random.randrange(pow(2, 20))

        MLCNCC.logger_global.info(
            "[__random_set_labels_index] seed random label (%s)",
            seed_random_label)
        label_names = np.array(dataset.attributes[-nb_labels:])
        origin_indices = dict(zip(label_names, range(nb_labels)))
        np.random.seed(seed_random_label)
        np.random.shuffle(label_names)
        MLCNCC.logger_global.info(
            "[__random_set_labels_index] origin index (%s)", origin_indices)
        MLCNCC.logger_global.info(
            "[__random_set_labels_index] shuffle labels (%s)", label_names)
        return origin_indices, label_names

    @staticmethod
    def shuffle_labels(dataset, nb_labels, seed_random_label=None):
        """
        :param dataset: (mutable)
        :type  classifip.dataset.arff.ArffFile
            with string values for columns (after discretization data)
            (warning: does not work with mixed value (float,string))
        :param nb_labels:
        :param seed_random_label: randomly mixing labels Y1, Y2, ..., Ym
        :type seed_random_label: float
        :return: <void> modify structure of dataset parameter
        """
        nb_cols = len(dataset.attributes)
        origin_indices, label_names = MLCNCC.__random_set_labels_index(
            dataset, nb_labels, seed_random_label)
        np_data = np.array(dataset.data)
        new_data_labels = np.empty((len(dataset.data), nb_labels), dtype='<U1')
        for index, label in enumerate(label_names):
            orig_idx = origin_indices[label]
            new_data_labels[:, index] = np.array(np_data[:, nb_cols -
                                                         nb_labels + orig_idx])
            dataset.attributes[nb_cols - nb_labels + index] = label
        np_data[:, -nb_labels:] = new_data_labels
        dataset.data = np_data.tolist()

    @staticmethod
    def shuffle_labels_train_testing(train_dataset,
                                     testing_dataset,
                                     nb_labels,
                                     seed_random_label=None):
        """
        :param train_dataset: (mutable)
        :type  classifip.dataset.arff.ArffFile
        :param testing_dataset: (mutable)
        :type  classifip.dataset.arff.ArffFile
        :param nb_labels:
        :param seed_random_label:
        :return:
        """
        nb_cols = len(train_dataset.attributes)
        origin_indices, label_names = MLCNCC.__random_set_labels_index(
            train_dataset, nb_labels, seed_random_label)

        np_data_train = np.array(train_dataset.data)
        np_data_test = np.array(testing_dataset.data)
        new_ltrain = np.empty((len(train_dataset.data), nb_labels),
                              dtype='<U1')
        new_ltest = np.empty((len(testing_dataset.data), nb_labels),
                             dtype='<U1')
        for index, label in enumerate(label_names):
            orig_idx = origin_indices[label]
            # exchange columns of the training dataset
            new_ltrain[:, index] = np.array(
                np_data_train[:, nb_cols - nb_labels + orig_idx])
            train_dataset.attributes[nb_cols - nb_labels + index] = label
            # exchange columns of the testing dataset
            new_ltest[:, index] = np.array(np_data_test[:, nb_cols -
                                                        nb_labels + orig_idx])
            testing_dataset.attributes[nb_cols - nb_labels + index] = label

        np_data_train[:, -nb_labels:] = new_ltrain
        np_data_test[:, -nb_labels:] = new_ltest
        train_dataset.data = np_data_train.tolist()
        testing_dataset.data = np_data_test.tolist()

    @staticmethod
    def missing_labels_learn_data_set(learn_data_set,
                                      nb_labels,
                                      missing_pct=0.0):
        """
        :param learn_data_set:
        :type learn_data_set: arff
        :param nb_labels: number of labels
        :type nb_labels: integer
        :param missing_pct: percentage of missing labels
        :type missing_pct: float
        :return:
        """
        if missing_pct < 0.0 or missing_pct > 1.0:
            raise Exception(
                'The percentage of missing labels must lie in [0.0, 1.0].')
        if missing_pct > 0.0:
            label_names = learn_data_set.attributes[-nb_labels:]
            for label_value in label_names:
                missing_label_index = np.random.choice(
                    len(learn_data_set.data),
                    int(len(learn_data_set.data) * missing_pct),
                    replace=False)
                col_ind = learn_data_set.attributes.index(label_value)
                for index, value in enumerate(learn_data_set.data):
                    if index in missing_label_index:
                        value[col_ind] = '-1'

    @staticmethod
    def noise_labels_learn_data_set(learn_data_set, nb_labels, noise_label_pct,
                                    noise_label_type, noise_label_prob):
        """
        :param learn_data_set:
        :type learn_data_set: arff
        :param nb_labels: number of labels
        :type nb_labels: integer
        :param noise_label_pct: percentage of noisy labels
        :type noise_label_pct: float
        :param noise_label_type: type of label-flipping noise
            (1) reverse flip 1<->0
            (2) set the label to relevant (1) with probability p (Bernoulli trials)
            (3) set the label to relevant (1) when a uniform draw is at least p
        :type noise_label_type: integer
        :param noise_label_prob: probability of flipping a label
        :type noise_label_prob: float
        """
        if noise_label_type not in [1, 2, 3, -1]:
            raise Exception(
                'Configuration noise label is not implemented yet.')
        if noise_label_pct < 0.0 or noise_label_pct > 1.0:
            raise Exception(
                'The percentage of noisy labels must lie in [0.0, 1.0].')

        if noise_label_pct > 0.0 and noise_label_type in [1, 2, 3]:
            size_learn_data = len(learn_data_set.data)
            set_label_index = np.zeros((size_learn_data, nb_labels), dtype=int)
            for i in range(nb_labels):
                noise_index_by_label = np.random.choice(size_learn_data,
                                                        int(size_learn_data *
                                                            noise_label_pct),
                                                        replace=False)
                if noise_label_type == 1:
                    set_label_index[noise_index_by_label, i] = 1
                elif noise_label_type == 2:
                    noise_label_flip = np.random.choice(
                        [0, 1],
                        size=int(size_learn_data * noise_label_pct),
                        p=[1 - noise_label_prob, noise_label_prob])
                    set_label_index[noise_index_by_label,
                                    i] = 3 - noise_label_flip  # 2:=1 and 3:=0
                elif noise_label_type == 3:
                    noise_uniform_rand = np.random.uniform(
                        size=int(size_learn_data * noise_label_pct))
                    noise_uniform_rand[
                        noise_uniform_rand >= noise_label_prob] = 1
                    noise_uniform_rand[
                        noise_uniform_rand < noise_label_prob] = 0
                    set_label_index[
                        noise_index_by_label,
                        i] = 3 - noise_uniform_rand  # 2:=1 and 3:=0

            if noise_label_type == 1:
                for i, instance in enumerate(learn_data_set.data):
                    noise_label_by_inst = abs(
                        set_label_index[i, :] -
                        np.array(instance[-nb_labels:], dtype=int))
                    instance[-nb_labels:] = noise_label_by_inst.astype(
                        '<U1').tolist()
            elif noise_label_type == 2 or noise_label_type == 3:
                for i, instance in enumerate(learn_data_set.data):
                    idx_zero = np.where(set_label_index[i, :] == 3)
                    idx_one = np.where(set_label_index[i, :] == 2)
                    noise_labels_value = np.array(instance[-nb_labels:],
                                                  dtype=int)
                    noise_labels_value[idx_zero] = 0
                    noise_labels_value[idx_one] = 1
                    instance[-nb_labels:] = noise_labels_value.astype(
                        '<U1').tolist()
            else:
                raise Exception(
                    'Configuration noise label is not implemented yet.')

    def lower_upper_probability(self, feature, feature_value, ncc_s_param,
                                feature_class_name, ncc_epsilon):
        """
         ... note:
            zero float division can happen if too many input features
            To avoid probability zero, we use the Laplace Smoothing
                https://en.wikipedia.org/wiki/Additive_smoothing
        :param feature:
        :param feature_value:
        :param ncc_s_param:
        :param feature_class_name:
        :param ncc_epsilon:
        :return:
        """
        def __restricting_idm(probability, ncc_epsilon_ip, len_features):
            return (1 - ncc_epsilon_ip
                    ) * probability + ncc_epsilon_ip / len_features

        f_val_index = self.feature_values[feature].index(feature_value)
        num_items = float(sum(self.feature_count[feature_class_name]))
        n_fi_c = self.feature_count[feature_class_name][
            f_val_index]  # n(f_i|c)
        len_fi = len(self.feature_count[feature_class_name])  # |F_i|
        # n(f_i|c)/(n(c)+s), lower probability: t(f_i|c)->0, t(c)->1
        try:
            p_lower = (n_fi_c / (num_items + ncc_s_param))
        except ZeroDivisionError:
            p_lower = (n_fi_c + 1) / (num_items + ncc_s_param + len_fi)
        # (n(f_i|c)+s)/(n(c)+s), upper probability: t(f_i|c)->1, t(c)->1
        try:
            p_upper = ((n_fi_c + ncc_s_param) / (num_items + ncc_s_param))
        except ZeroDivisionError:
            p_upper = ((n_fi_c + ncc_s_param + 1) /
                       (num_items + ncc_s_param + len_fi))
        # some regularization with epsilon
        p_lower = __restricting_idm(p_lower, ncc_epsilon, len_fi)
        p_upper = __restricting_idm(p_upper, ncc_epsilon, len_fi)
        return p_lower, p_upper
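    # A worked numeric sketch of the bounds above, with illustrative values
    # (not taken from the library): n(f_i|c)=3, n(c)=10, s=2, |F_i|=4,
    # epsilon=0.001:
    #   p_lower = 3 / (10 + 2)       = 0.25
    #   p_upper = (3 + 2) / (10 + 2) ~ 0.4167
    # The epsilon restriction maps p to (1 - eps) * p + eps / |F_i|:
    #   p_lower -> 0.999 * 0.25   + 0.00025 = 0.25
    #   p_upper -> 0.999 * 0.4167 + 0.00025 ~ 0.4165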

    def lower_upper_marginal_Y(self, idx_label_to_infer, value_label_to_infer,
                               ncc_s_param):
        # @salmuz: TODO apply Laplace smoothing when n_label_data == 0 and ncc_s_param == 0
        count_label = self.marginal_props[idx_label_to_infer][
            value_label_to_infer]
        n_label_data = self.marginal_props[idx_label_to_infer]["all"]
        p_lower = count_label / (n_label_data + ncc_s_param)
        p_upper = (count_label + ncc_s_param) / (n_label_data + ncc_s_param)
        self._logger.debug(
            "[Bound-Marginal] (idx_label_to_infer, p_lower, p_upper ) (%s, %s, %s)",
            idx_label_to_infer, p_lower, p_upper)
        return p_lower, p_upper

    def lower_upper_probability_feature(self, idx_label_to_infer, item,
                                        ncc_s_param, ncc_epsilon):
        # (n(c)+st(c))/(N+s), with s=0 (i.e. prior probabilities precise, P(Y))
        if self.has_imprecise_marginal:
            l_denominator_0, u_denominator_0 = self.lower_upper_marginal_Y(
                idx_label_to_infer, 0, ncc_s_param)
            l_numerator_1, u_numerator_1 = self.lower_upper_marginal_Y(
                idx_label_to_infer, 1, ncc_s_param)
        else:
            all_bits_label = self.marginal_props[idx_label_to_infer]["all"]
            # Applying Laplace Smoothing (where |C| = 2, binary case):
            #       P(Y_i = idx_label_to_infer) = (n(idx_label_to_infer) + 1)/(n + |C|)
            prop_marginal_label_1 = (self.marginal_props[idx_label_to_infer][1]
                                     + 1) / (all_bits_label + 2)
            u_denominator_0 = 1 - prop_marginal_label_1  # \overline P(Yj=0)
            l_denominator_0 = 1 - prop_marginal_label_1  # \underline P(Yj=0)
            u_numerator_1 = prop_marginal_label_1  # \overline P(Yj=1)
            l_numerator_1 = prop_marginal_label_1  # \underline P(Yj=1)

        for f_index, feature in enumerate(self.feature_names):
            # computation of denominator (label=1)
            feature_class_name = self.label_names[
                idx_label_to_infer] + '|in|' + feature  # (f_i, c=1)
            p_lower, p_upper = self.lower_upper_probability(
                feature, item[f_index], ncc_s_param, feature_class_name,
                ncc_epsilon)
            l_numerator_1 = l_numerator_1 * p_lower  # prod \underline{P}(f_i|c=1)
            u_numerator_1 = u_numerator_1 * p_upper  # prod \overline{P}(f_i|c=1)

            # computation of numerator (label=0)
            feature_class_name = self.label_names[
                idx_label_to_infer] + '|out|' + feature
            p_lower, p_upper = self.lower_upper_probability(
                feature, item[f_index], ncc_s_param, feature_class_name,
                ncc_epsilon)
            l_denominator_0 = l_denominator_0 * p_lower  # prod \underline{P}(f_i|c=0)
            u_denominator_0 = u_denominator_0 * p_upper  # prod \overline{P}(f_i|c=0)

        return u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0

    def lower_upper_probability_labels(self,
                                       idx_label_to_infer,
                                       augmented_labels,
                                       ncc_s_param,
                                       ncc_epsilon,
                                       idx_chain_predict_labels=None):
        """
        :param idx_label_to_infer: name of label selected
        :param augmented_labels: list of characters values '0' or '1'
        :param ncc_s_param:
        :param ncc_epsilon:
        :param idx_chain_predict_labels:
        :return:
        """
        u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0 = 1, 1, 1, 1
        if idx_chain_predict_labels is None:
            dependant_labels = enumerate(
                self.label_names[:len(augmented_labels)])
        else:
            dependant_labels = zip(idx_chain_predict_labels,
                                   self.label_names[idx_chain_predict_labels])

        self._logger.debug(
            "[Bound-Labels] (label_to_infer, augmented_labels, idx_chain_predict_labels ) (%s, %s, %s)",
            self.label_names[idx_label_to_infer], augmented_labels,
            idx_chain_predict_labels)

        for l_index, label in dependant_labels:
            label_predicted_value = str(augmented_labels[l_index])
            # computation of denominator (label=1)
            label_class_name = self.label_names[
                idx_label_to_infer] + '|in|' + label  # (l_i=1, c=1)
            p_lower, p_upper = self.lower_upper_probability(
                label, label_predicted_value, ncc_s_param, label_class_name,
                ncc_epsilon)
            l_numerator_1 = l_numerator_1 * p_lower  # prod \underline{P}(f_i|c=1)
            u_numerator_1 = u_numerator_1 * p_upper  # prod \overline{P}(f_i|c=1)

            # computation of numerator (label=0)
            label_class_name = self.label_names[
                idx_label_to_infer] + '|out|' + label  # (l_i=0, c=0)
            p_lower, p_upper = self.lower_upper_probability(
                label, label_predicted_value, ncc_s_param, label_class_name,
                ncc_epsilon)
            l_denominator_0 = l_denominator_0 * p_lower  # prod \underline{P}(f_i|c=0)
            u_denominator_0 = u_denominator_0 * p_upper  # prod \overline{P}(f_i|c=0)
        return u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0

    def lower_upper_cond_probability(self,
                                     idx_label_to_infer,
                                     instance,
                                     augmented_labels,
                                     ncc_s_param,
                                     ncc_epsilon,
                                     idx_chain_predict_labels=None):
        """
        .. note::
            TO DO: To avoid probability zero, we use the Laplace Smoothing
                https://en.wikipedia.org/wiki/Additive_smoothing
        :param idx_label_to_infer:
        :param instance:
        :param augmented_labels:
        :param ncc_s_param:
        :param ncc_epsilon:
        :param idx_chain_predict_labels:
        :return:
        """

        u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0 = \
            self.lower_upper_probability_feature(idx_label_to_infer,
                                                 instance,
                                                 ncc_s_param,
                                                 ncc_epsilon)

        u_numerator_label_1, l_numerator_label_1, u_denominator_label_0, l_denominator_label_0 = \
            self.lower_upper_probability_labels(idx_label_to_infer,
                                                augmented_labels,
                                                ncc_s_param,
                                                ncc_epsilon,
                                                idx_chain_predict_labels)

        u_numerator_1 = u_numerator_1 * u_numerator_label_1
        l_numerator_1 = l_numerator_1 * l_numerator_label_1
        u_denominator_0 = u_denominator_0 * u_denominator_label_0
        l_denominator_0 = l_denominator_0 * l_denominator_label_0
        return u_numerator_1, l_numerator_1, u_denominator_0, l_denominator_0
Example #16
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  cv_nfold=10,
                                  model_type="ilda",
                                  test_size=0.4,
                                  from_ell=0.1,
                                  to_ell=1.0,
                                  by_ell=0.1,
                                  seeds=None,
                                  lib_path_server=None,
                                  nb_process=2,
                                  n_sampling=10,
                                  skip_n_sample=0,
                                  criterion="maximality",
                                  scaling=False):
    assert os.path.exists(in_path), "Without training data, cannot test"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean_sampling", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                criterion)
    logger.info(
        'Parameters (size, fromEll, toEll, byEll, nbProcess, sampling, nSkip) (%s, %s, %s, %s, %s, %s, %s)',
        test_size, from_ell, to_ell, by_ell, nb_process, n_sampling,
        skip_n_sample)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    # Seeds to resume from if the process is killed
    seeds = generate_seeds(n_sampling) if seeds is None else seeds
    logger.debug("MODEL: %s, SEED: %s", model_type, seeds)

    # Create a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    acc_u80, acc_u65 = dict(), dict()
    for sampling in range(min(n_sampling, len(seeds))):
        X_learning, X_testing, y_learning, y_testing = \
            train_test_split(X, y, test_size=test_size, random_state=seeds[sampling])
        logger.info("Splits %s learning %s", sampling, y_learning)
        logger.info("Splits %s testing %s", sampling, y_testing)

        # Once past the skipped samplings, restart from_ell at 0.01
        if skip_n_sample != 0 and sampling > skip_n_sample: from_ell = 0.01
        # Skip the first skip_n_sample samplings (useful for parallel computing)
        if sampling >= skip_n_sample:
            kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
            ell_u65, ell_u80, splits = dict(), dict(), list([])
            for idx_train, idx_test in kf.split(y_learning):
                splits.append((idx_train, idx_test))
                logger.info("Sampling %s Splits %s train %s", sampling,
                            len(splits), idx_train)
                logger.info("Sampling %s Splits %s test %s", sampling,
                            len(splits), idx_test)

            for ell_current in np.arange(from_ell, to_ell, by_ell):
                ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                logger.info("ELL_CURRENT %s", ell_current)
                for idx_train, idx_test in splits:
                    logger.info("Splits train %s", idx_train)
                    logger.info("Splits test %s", idx_test)
                    X_cv_train, y_cv_train = X_learning[idx_train], y_learning[
                        idx_train]
                    X_cv_test, y_cv_test = X_learning[idx_test], y_learning[
                        idx_test]
                    # Computing accuracy testing for cross-validation step
                    ell_u65[ell_current], ell_u80[ell_current] = \
                        computing_training_testing_step(X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_current,
                                                        manager, ell_u65[ell_current], ell_u80[ell_current])
                    logger.info("Partial-kfold (%s, %s, %s)", ell_current,
                                ell_u65[ell_current], ell_u80[ell_current])

                ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
                ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
                writer.writerow([
                    ell_current, sampling, ell_u65[ell_current],
                    ell_u80[ell_current]
                ])
                file_csv.flush()
                logger.debug("Partial-ell-sampling (%s, %s, %s, %s)",
                             ell_current, sampling, ell_u65, ell_u80)
            logger.debug("Total-ell-sampling (%s, %s, %s, %s)", in_path,
                         sampling, ell_u65, ell_u80)

            # Compute the optimal ell values to use in the testing step
            acc_ellu80 = max(ell_u80.values())
            acc_ellu65 = max(ell_u65.values())
            ellu80_opts = [k for k, v in ell_u80.items() if v == acc_ellu80]
            ellu65_opts = [k for k, v in ell_u65.items() if v == acc_ellu65]
            acc_u65[sampling], acc_u80[sampling] = 0, 0
            n_ell80_opts, n_ell65_opts = len(ellu80_opts), len(ellu65_opts)

            for ellu80_opt in ellu80_opts:
                logger.info("ELL_OPTIMAL_SAMPLING_U80 %s", ellu80_opt)
                _, acc_u80[sampling] = \
                    computing_training_testing_step(X_learning, y_learning, X_testing, y_testing, ellu80_opt,
                                                    manager, 0, acc_u80[sampling])

            for ellu65_opt in ellu65_opts:
                logger.info("ELL_OPTIMAL_SAMPLING_U65 %s", ellu65_opt)
                acc_u65[sampling], _ = \
                    computing_training_testing_step(X_learning, y_learning, X_testing, y_testing, ellu65_opt,
                                                    manager, acc_u65[sampling], 0)

            acc_u65[sampling] = acc_u65[sampling] / n_ell65_opts
            acc_u80[sampling] = acc_u80[sampling] / n_ell80_opts
            writer.writerow(
                [-999, sampling, acc_u65[sampling], acc_u80[sampling]])
            file_csv.flush()
            logger.debug("Partial-ell-2step (%s, %s, %s, %s)", -999,
                         ellu80_opts, acc_u65[sampling], acc_u80[sampling])

    writer.writerow([
        -9999, -9,
        np.mean(list(acc_u65.values())),
        np.mean(list(acc_u80.values()))
    ])
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path,
                 np.mean(list(acc_u65.values())),
                 np.mean(list(acc_u80.values())))
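
The example above depends on two helpers whose definitions are not shown here, generate_seeds and normalize_minmax. The following is a minimal sketch of plausible implementations, consistent with how they are called; the actual classifip helpers may differ in detail:

import random
import numpy as np

def generate_seeds(n):
    # One random seed per sampling, logged so an interrupted run can be resumed.
    return [random.randrange(pow(2, 30)) for _ in range(n)]

def normalize_minmax(X):
    # Column-wise min-max scaling of the feature matrix into [0, 1];
    # constant columns are left untouched to avoid division by zero.
    X = np.asarray(X, dtype=float)
    x_min, x_max = X.min(axis=0), X.max(axis=0)
    return (X - x_min) / np.where(x_max > x_min, x_max - x_min, 1.0)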
Example #17
def experiments_binr_vs_imprecise(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  missing_pct=0.0,
                                  noise_label_pct=0.0,
                                  noise_label_type=-1,
                                  noise_label_prob=0.5,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=False,
                                  epsilon_rejects=None,
                                  min_ncc_s_param=0.5,
                                  max_ncc_s_param=6.0,
                                  step_ncc_s_param=1.0,
                                  remove_features=None,
                                  k_nearest_neighbors=None):
    """
    Experiments with binary relevant imprecise and missing/noise data.

    :param in_path:
    :param out_path:
    :param seed:
    :param missing_pct: percentage of missing labels
    :param noise_label_pct: percentage of noise labels
    :param noise_label_type: type of perturbation noise
    :param noise_label_prob: probability of noise labels
    :param nb_kFold:
    :param nb_process: number of process in parallel
    :param scaling: scaling of the X input space (used for the knn-nccbr classifier)
    :param epsilon_rejects: epsilon of reject option (for comparing with imprecise version)
    :param min_ncc_s_param: minimum value of imprecise parameter s
    :param max_ncc_s_param: maximum value of imprecise parameter s
    :param step_ncc_s_param: discretization step of parameter s
    :param remove_features: features not to take into account
    :param k_nearest_neighbors: multiplier k of the pairwise-distance radius over
            all instances; determines how big the ball containing neighbors is.

    .. note::
        TODO: fix a bug occurring when the missing percentage is high (90%).

    """
    assert os.path.exists(in_path), "Training data file does not exist"
    assert os.path.exists(out_path), "Output file for results does not exist"
    assert k_nearest_neighbors is not None, "k_nearest_neighbors is required for the knn algorithm"
    assert k_nearest_neighbors > 0, "k_nearest_neighbors must be positive"

    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info(
        "(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
        min_ncc_s_param, max_ncc_s_param, step_ncc_s_param)
    logger.info(
        "(scaling, remove_features, process, epsilon_rejects) (%s, %s, %s, %s)",
        scaling, remove_features, nb_process, epsilon_rejects)
    logger.info(
        "(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
        missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    logger.info("( k_nearest_neighbors)  (%s)", k_nearest_neighbors)

    # Seed random values for the top-level k-fold learning-testing splits
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Open the CSV file (append mode) for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    manager = ManagerWorkers(nb_process=nb_process,
                             fun_prediction=skeptical_prediction)
    manager.executeAsync(
        class_model="classifip.models.mlc.knnnccbr.KNN_NCC_BR")

    ich_skep, cph_skep, acc_prec = dict(), dict(), dict()
    ich_reject, cph_reject = dict(), dict()
    min_discretize, max_discretize = 5, 7
    for nb_disc in range(min_discretize, max_discretize):
        data_learning, nb_labels = init_dataset(in_path, remove_features,
                                                scaling)
        p_dimension = len(data_learning.data[0]) - nb_labels

        # save the continuous data and instance indices for KNN-NCC-BR classification
        data_continuous = data_learning.make_clone()
        # append a raw index to each instance (needed when using knn-ncc)
        for idx, row_instance in enumerate(data_learning.data):
            row_instance.insert(p_dimension + nb_labels, idx)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10x10 cross-validation
            logger.info(
                "Number of intervals for discretization and labels (%1d, %1d).",
                nb_disc, nb_labels)
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])

            splits_s = list([])
            for training, testing in cv_kfold:
                # make a clone because the folds share the same memory address
                splits_s.append((training.make_clone(), testing.make_clone()))
                logger.info("Splits %s train %s", len(training.data),
                            training.data[0][1:4])
                logger.info("Splits %s test %s", len(testing.data),
                            testing.data[0][1:4])

            disc = str(nb_disc) + "-" + str(time)
            ich_skep[disc], cph_skep[disc], acc_prec[disc] = dict(), dict(), dict()
            ich_reject[disc], cph_reject[disc] = dict(), dict()
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param,
                                   step_ncc_s_param):
                ks_ncc = str(s_ncc)
                init_scores(ks_ncc, ich_skep[disc], cph_skep[disc],
                            acc_prec[disc], ich_reject[disc], cph_reject[disc],
                            epsilon_rejects)
                for idx_fold, (training, testing) in enumerate(splits_s):
                    logger.info("Splits %s train %s", len(training.data),
                                training.data[0][1:4])
                    logger.info("Splits %s test %s", len(testing.data),
                                testing.data[0][1:4])
                    rs = computing_training_testing_step(
                        training, testing, missing_pct, noise_label_pct,
                        noise_label_type, noise_label_prob, nb_labels,
                        p_dimension, s_ncc, manager, epsilon_rejects,
                        ich_skep[disc][ks_ncc], cph_skep[disc][ks_ncc],
                        acc_prec[disc][ks_ncc], ich_reject[disc][ks_ncc],
                        cph_reject[disc][ks_ncc], data_continuous,
                        k_nearest_neighbors)
                    ich_skep[disc][ks_ncc], cph_skep[disc][ks_ncc] = rs[0], rs[1]
                    acc_prec[disc][ks_ncc] = rs[2]
                    ich_reject[disc][ks_ncc], cph_reject[disc][ks_ncc] = rs[3], rs[4]
                    logger.debug("Partial-s-k_step (acc, ich_skep) (%s, %s)",
                                 acc_prec[disc][ks_ncc],
                                 ich_skep[disc][ks_ncc])
                ich_skep[disc][ks_ncc] = ich_skep[disc][ks_ncc] / nb_kFold
                cph_skep[disc][ks_ncc] = cph_skep[disc][ks_ncc] / nb_kFold
                acc_prec[disc][ks_ncc] = acc_prec[disc][ks_ncc] / nb_kFold
                _partial_saving = [
                    str(nb_disc), s_ncc, time, ich_skep[disc][ks_ncc],
                    cph_skep[disc][ks_ncc], acc_prec[disc][ks_ncc]
                ]
                if epsilon_rejects is not None:
                    _reject_ich = [
                        e / nb_kFold
                        for e in ich_reject[disc][ks_ncc].values()
                    ]
                    _reject_cph = [
                        e / nb_kFold
                        for e in cph_reject[disc][ks_ncc].values()
                    ]
                    _partial_saving = _partial_saving + _reject_ich + _reject_cph
                else:
                    _reject_ich, _reject_cph = [], []
                logger.debug("Partial-s-k_step reject values (%s)",
                             ich_reject[disc][ks_ncc])
                writer.writerow(_partial_saving)
                file_csv.flush()
                logger.debug(
                    "Partial-s-k_step (disc, s, time, ich_skep, cph_skep, acc, ich_reject, cph_reject)"
                    "(%s, %s, %s, %s, %s, %s, %s, %s)", disc, s_ncc, time,
                    ich_skep[disc][ks_ncc], cph_skep[disc][ks_ncc],
                    acc_prec[disc][ks_ncc], _reject_ich, _reject_cph)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s, %s", ich_skep, cph_skep, acc_prec)
Example #18
 def __init__(self, DEBUG=False):
     super(BinaryILogisticLasso, self).__init__(DEBUG)
     self._logger = create_logger("BinaryILogistic", DEBUG)
     self._lasso_models = None
     self._precise_logit = None
     self._gammas = None
Example #19
 def __init__(self, DEBUG=False):
     super(MLChaining, self).__init__(DEBUG)
     self._logger = create_logger("MLChaining", DEBUG)
Example #20
def computing_best_imprecise_mean(in_path=None, out_path=None, lib_path_server=None, model_type="ilda",
                                  from_ell=0.1, to_ell=1.0, by_ell=0.1, seed=None, cv_kfold_first=10,
                                  nb_process=2, skip_nfold=0, cv_kfold_second=10, seed_second=None, scaling=False):
    assert os.path.exists(in_path), "Training data file does not exist"
    assert os.path.exists(out_path), "Output file for results does not exist"

    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, out_path, model_type)
    logger.info('Parameters (from_ell, to_ell, by_ell, nb_process, skip_nfold, cv_kfold_second) '
                '(%s, %s, %s, %s, %s, %s)',
                from_ell, to_ell, by_ell, nb_process, skip_nfold, cv_kfold_second)

    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    # Seed a random value for the top-level k-fold learning-testing split
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("[FIRST-STEP-SEED] MODEL: %s, SEED: %s", model_type, seed)

    # Open the CSV file (append mode) for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type, lib_path_server)

    kfFirst = KFold(n_splits=cv_kfold_first, random_state=seed, shuffle=True)
    acc_u80, acc_u65, idx_kfold = dict(), dict(), 0
    seed_2step = generate_seeds(cv_kfold_second) if seed_second is None else seed_second
    logger.debug("[SECOND-STEP-SEEDS] MODEL: %s, SEED: %s, SECOND-SEED: %s", model_type, seed, seed_2step)
    for idx_learning, idx_testing in kfFirst.split(y):
        ell_u65, ell_u80 = dict(), dict()
        # Generate sampling k-fold (learning, testing) for optimal ell parameters
        X_learning, y_learning = X[idx_learning], y[idx_learning]
        X_testing, y_testing = X[idx_testing], y[idx_testing]
        logger.info("Splits %s learning %s", idx_kfold, idx_learning)
        logger.info("Splits %s testing %s", idx_kfold, idx_testing)

        # Once past the skipped folds, restart from_ell at 0.01
        if skip_nfold != 0 and idx_kfold > skip_nfold:
            from_ell = 0.01

        # Skip the first skip_nfold folds of the cross-validation (useful for parallel computing)
        if idx_kfold >= skip_nfold:
            # Generate the same second-level k-fold (train, test) splits so every ell is scored on identical folds
            splits_ell = list([])
            logger.debug("[2-STEP-SEED] MODEL: %s, SEED: %s OF FIRST STEP %s", model_type, seed_2step[idx_kfold], seed)
            kfSecond = KFold(n_splits=cv_kfold_second, random_state=seed_2step[idx_kfold], shuffle=True)
            for idx_learn_train, idx_learn_test in kfSecond.split(y_learning):
                splits_ell.append((idx_learn_train, idx_learn_test))
                logger.info("Splits %s train %s", len(splits_ell), idx_learn_train)
                logger.info("Splits %s test %s", len(splits_ell), idx_learn_test)

            for ell_current in np.arange(from_ell, to_ell, by_ell):
                ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                logger.info("ELL_CURRENT %s", ell_current)
                for idx_learn_train, idx_learn_test in splits_ell:
                    logger.info("Splits step train %s", idx_learn_train)
                    logger.info("Splits step test %s", idx_learn_test)
                    X_cv_train, y_cv_train = X_learning[idx_learn_train], y_learning[idx_learn_train]
                    X_cv_test, y_cv_test = X_learning[idx_learn_test], y_learning[idx_learn_test]

                    ell_u65[ell_current], ell_u80[ell_current], _ = \
                        computing_training_testing_step(X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_current,
                                                        manager, ell_u65[ell_current], ell_u80[ell_current])

                    logger.info("Partial-kfold (%s, %s, %s)", ell_current, ell_u65[ell_current], ell_u80[ell_current])
                # average over the cv_kfold_second inner folds
                ell_u65[ell_current] = ell_u65[ell_current] / cv_kfold_second
                ell_u80[ell_current] = ell_u80[ell_current] / cv_kfold_second
                writer.writerow([ell_current, idx_kfold, ell_u65[ell_current], ell_u80[ell_current]])
                file_csv.flush()
                logger.debug("Partial-ell-k-step (%s, %s, %s)", idx_kfold, ell_u65[ell_current], ell_u80[ell_current])
            logger.debug("Total-ell-k-step (%s, %s, %s, %s)", in_path, idx_kfold, ell_u65, ell_u80)

            # Compute the optimal ell values to use in the testing step
            acc_ell_u80 = max(ell_u80.values())
            acc_ell_u65 = max(ell_u65.values())
            ell_u80_opts = [k for k, v in ell_u80.items() if v == acc_ell_u80]
            ell_u65_opts = [k for k, v in ell_u65.items() if v == acc_ell_u65]
            acc_u65[idx_kfold], acc_u80[idx_kfold] = 0, 0
            n_ell80_opts, n_ell65_opts = len(ell_u80_opts), len(ell_u65_opts)
            for ell_u80_opt in ell_u80_opts:
                logger.info("ELL_OPTIMAL_CV_U80 %s", ell_u80_opt)
                _, _acc_u80, _ = \
                    computing_training_testing_step(X_learning, y_learning, X_testing, y_testing, ell_u80_opt,
                                                    manager, 0, 0)
                acc_u80[idx_kfold] += _acc_u80
                writer.writerow([-999, -8, ell_u80_opt, _acc_u80])

            for ell_u65_opt in ell_u65_opts:
                logger.info("ELL_OPTIMAL_CV_U65 %s", ell_u65_opt)
                _acc_u65, _, _ = \
                    computing_training_testing_step(X_learning, y_learning, X_testing, y_testing, ell_u65_opt,
                                                    manager, 0, 0)
                acc_u65[idx_kfold] += _acc_u65
                writer.writerow([-999, -7, ell_u65_opt, _acc_u65])

            acc_u65[idx_kfold] = acc_u65[idx_kfold] / n_ell65_opts
            acc_u80[idx_kfold] = acc_u80[idx_kfold] / n_ell80_opts
            writer.writerow([-999, idx_kfold, acc_u65[idx_kfold], acc_u80[idx_kfold]])
            file_csv.flush()
            logger.debug("Partial-ell-2step (u80, u65, accs) (%s, %s, %s, %s, %s)", -999, ell_u80_opts, ell_u65_opts,
                         acc_u65[idx_kfold], acc_u80[idx_kfold])
        idx_kfold += 1
    writer.writerow([-9999, -9, np.mean(list(acc_u65.values())), np.mean(list(acc_u80.values()))])
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path, np.mean(list(acc_u65.values())),
                 np.mean(list(acc_u80.values())))
Example #21
def performance_qda_regularized(in_path=None,
                                out_path=None,
                                cv_n_fold=10,
                                seeds=None,
                                from_alpha=0,
                                to_alpha=2.0,
                                by_alpha=0.01,
                                scaling=False):
    assert os.path.exists(in_path), "Training data file does not exist"
    assert os.path.exists(out_path), "Output file for saving performance does not exist"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_qda_regularized", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, "qda")
    X = data.iloc[:, :-1].values
    if scaling: X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    alphas = np.arange(from_alpha, to_alpha, by_alpha)
    writer.writerow(alphas)

    qda_regularized = [None] * len(alphas)
    for idx, alpha in enumerate(alphas):
        qda_regularized[idx] = __factory_model_precise("qda",
                                                       store_covariance=True,
                                                       reg_param=alpha)
    # Generate a random k-fold validation.
    kf_second = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    ikfold, accuracy, best_alphas = 0, [0] * cv_n_fold, [0] * cv_n_fold
    for idx_learning, idx_testing in kf_second.split(y):
        X_training, y_training = X[idx_learning], y[idx_learning]
        X_testing, y_testing = X[idx_testing], y[idx_testing]
        kf = KFold(n_splits=cv_n_fold,
                   random_state=seeds[ikfold],
                   shuffle=True)
        acc_u80 = [0] * len(qda_regularized)
        for idx_train, idx_test in kf.split(y_training):
            X_cv_train, y_cv_train = X_training[idx_train], y_training[idx_train]
            X_cv_test, y_cv_test = X_training[idx_test], y_training[idx_test]
            for model in qda_regularized:
                model.fit(X_cv_train, y_cv_train)
            n_test = len(idx_test)
            for i, test in enumerate(X_cv_test):
                for im, model in enumerate(qda_regularized):
                    evaluate = model.predict([test])
                    if y_cv_test[i] in evaluate:
                        acc_u80[im] += (u80(evaluate) / n_test) / cv_n_fold
        idx_best = np.argmax(acc_u80)
        logger.info("[1kfold:best_model:seed:u80] (%s, %s, %s, %s)", ikfold,
                    alphas[idx_best], seeds[ikfold], acc_u80)
        writer.writerow(acc_u80)
        file_csv.flush()

        best_model = __factory_model_precise("qda",
                                             store_covariance=True,
                                             reg_param=alphas[idx_best])
        best_model.fit(X_training, y_training)
        accuracy[ikfold] = 0
        bn_test = len(idx_testing)
        best_alphas[ikfold] = alphas[idx_best]
        for i, test in enumerate(X_testing):
            evaluate = best_model.predict([test])
            if y_testing[i] in evaluate:
                accuracy[ikfold] += u80(evaluate) / bn_test
        logger.info("[2kfold:best_model:seed:accuracy] (%s, %s, %s)", ikfold,
                    alphas[idx_best], accuracy[ikfold])
        ikfold += 1
    file_csv.close()
    logger.info("[total:data-set:avgResults] (%s, %s, %s, %s)", in_path,
                np.mean(accuracy), best_alphas, accuracy)
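
The u80 calls above (and the ell_u65/ell_u80 scores in the previous example) are discounted utilities for set-valued predictions. A minimal sketch under the standard definitions, with x the inverse of the prediction-set size; the library's own implementation may differ:

def u65(prediction_set):
    # u65 utility: -0.6 * x**2 + 1.6 * x
    x = 1.0 / len(prediction_set)
    return -0.6 * x ** 2 + 1.6 * x

def u80(prediction_set):
    # u80 utility rewards cautious (set-valued) predictions more: -1.2 * x**2 + 2.2 * x
    x = 1.0 / len(prediction_set)
    return -1.2 * x ** 2 + 2.2 * x

# A correct singleton prediction scores 1.0; a correct pair scores 0.65 / 0.8.
assert u65([0]) == 1.0 and u80([0]) == 1.0
assert abs(u65([0, 1]) - 0.65) < 1e-9 and abs(u80([0, 1]) - 0.8) < 1e-9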
Example #22
def experiments_binr_vs_imprecise(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  missing_pct=0.0,
                                  noise_label_pct=0.0,
                                  noise_label_type=-1,
                                  noise_label_prob=0.5,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=False,
                                  epsilon_rejects=None,
                                  min_ell_param=0.5,
                                  max_ell_param=6.0,
                                  step_ell_param=1.0,
                                  remove_features=None,
                                  is_resampling=False):
    """
    Experiments with binary relevant imprecise and missing/noise data.

    :param in_path:
    :param out_path:
    :param seed:
    :param missing_pct: percentage of missing labels
    :param noise_label_pct: percentage of noise labels
    :param noise_label_type: type of perturbation noise
    :param noise_label_prob: probability of noise labels
    :param nb_kFold:
    :param nb_process: number of process in parallel
    :param scaling: scaling of the X input space (used for the knn-nccbr classifier)
    :param epsilon_rejects: epsilon of reject option (for comparing with imprecise version)
    :param min_ell_param: minimum value of imprecise parameter s
    :param max_ell_param: maximum value of imprecise parameter s
    :param step_ell_param: discretization step of parameter s
    :param remove_features: features not to take into account
    :param is_resampling: whether test and training data sets were re-sampled and generated beforehand

    .. note::
        TODO: fix a bug occurring when the missing percentage is high (90%).

    """
    if not is_resampling:
        assert os.path.exists(in_path), "Training data file does not exist"
    assert os.path.exists(out_path), "Output file for results does not exist"

    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info("(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
                min_ell_param, max_ell_param, step_ell_param)
    logger.info("(scaling, remove_features, process, epsilon_rejects) (%s, %s, %s, %s)",
                scaling, remove_features, nb_process, epsilon_rejects)
    logger.info("(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
                missing_pct, noise_label_pct, noise_label_type, noise_label_prob)

    # Seed random values for the top-level k-fold learning-testing splits
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Open the CSV file (append mode) for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    # Create a CSV file for saving query predictions on new instances
    out_path_partial = out_path[:-4] + "partial.csv"
    if not os.path.exists(out_path_partial):
        with open(out_path_partial, 'w'): pass
    fpartial_csv = open(out_path_partial, 'a')
    wpartial = csv.writer(fpartial_csv)
    save_query = save_partial_query_classification(fpartial_csv, wpartial)

    # instantiate the worker manager with the classifier class
    manager = ManagerWorkers(nb_process=nb_process, fun_prediction=skeptical_prediction)
    manager.executeAsync(class_model="classifip.models.mlc.igdabr.IGDA_BR")

    # c constants for the abstained multi-label setting
    list_c_spe = [(num + 1) * .05 for num in range(10)]
    list_c_par = [(num + 1) * .1 for num in range(10)]

    # performance metrics
    metrics = MetricsPerformances(do_inference_exact=False,
                                  epsilon_rejects=epsilon_rejects,
                                  list_constants_spe=list_c_spe,
                                  list_constants_par=list_c_par)

    if not is_resampling:
        cv10x10fold_br_vs_ibr(logger, manager, metrics, remove_features, scaling, nb_kFold,
                              seed, writer, file_csv, min_ell_param, max_ell_param, step_ell_param,
                              missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    else:
        re_sampling_with_pct_train(logger, manager, metrics, nb_kFold, writer, file_csv,
                                   min_ell_param, max_ell_param, step_ell_param, missing_pct,
                                   noise_label_pct, noise_label_type, noise_label_prob, save_query)

    manager.poisonPillTraining()
    file_csv.close()
    fpartial_csv.close()
    logger.debug("Results Final: %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s",
                 metrics.ich_iid_skeptic, metrics.cph_iid_skeptic,
                 metrics.score_hamming,
                 metrics.ich_spe_partial, metrics.cph_spe_partial,
                 metrics.ich_par_partial, metrics.cph_par_partial,
                 metrics.spe_partial_score, metrics.par_partial_score,
                 metrics.ich_reject, metrics.cph_reject)
Example #23
import numpy as np, random, os, time, sys
from classifip.utils import create_logger
from CSP_common import *
from classifip.models.ncclr import NCCLR
from classifip.evaluation.measures import correctness_measure, completeness_measure
from classifip.dataset.arff import ArffFile
from classifip.evaluation import k_fold_cross_validation
import multiprocessing
from functools import partial

logger = create_logger("computing_best_min_s_cross_validation", True)


def parallel_prediction_csp(model, test_data, dataset, evaluatePBOX):
    idx, pBox = evaluatePBOX
    predicts = model.inference_CSP([pBox])
    y_ground_truth = test_data[idx][-1].split(">")
    correctness = correctness_measure(y_ground_truth, predicts[0])
    completeness = completeness_measure(y_ground_truth, predicts[0])
    is_coherent = False
    pid = multiprocessing.current_process().name

    def _pinfo(message, kwargs):
        print("[" + pid + "][" + time.strftime('%x %X %Z') + "]",
              "-",
              message % kwargs,
              flush=True)

    if predicts[0] is not None:
        # a non-None prediction means the CSP instance is coherent
        is_coherent = True
        _pinfo("Prediction (idx, correctness, completeness) (%s, %s, %s)",
               (idx, correctness, completeness))
    # the source is truncated here; returning the per-instance measures is an
    # assumed completion so that a process pool can collect the results
    return idx, correctness, completeness, is_coherent
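
The fragment above ends abruptly, but the imports of multiprocessing and functools.partial suggest how it is driven: the shared arguments are bound with partial and each enumerated p-box is dispatched to a worker pool. A hypothetical driver, assuming parallel_prediction_csp returns the per-instance measures as completed above:

def run_parallel_csp(model, test_data, dataset, p_boxes, nb_process=4):
    # Bind the shared arguments, then map the workers over the enumerated p-boxes.
    target = partial(parallel_prediction_csp, model, test_data, dataset)
    with multiprocessing.Pool(processes=nb_process) as pool:
        results = pool.map(target, list(enumerate(p_boxes)))
    # Each result is an (idx, correctness, completeness, is_coherent) tuple.
    return results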
Example #24
def experiments_chaining_imprecise(
        in_path=None,
        out_path=None,
        seed=None,
        nb_kFold=10,
        nb_process=1,
        min_ncc_s_param=0.5,
        max_ncc_s_param=6.0,
        step_ncc_s_param=1.0,
        missing_pct=0.0,
        noise_label_pct=0.0,
        noise_label_type=-1,
        noise_label_prob=0.5,
        remove_features=None,
        scaling=False,
        strategy_chaining=IMLCStrategy.IMPRECISE_BRANCHING,
        safety_chaining=False):
    assert os.path.exists(in_path), "Training data file does not exist"
    assert os.path.exists(out_path), "Output file for results does not exist"

    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info(
        "(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
        min_ncc_s_param, max_ncc_s_param, step_ncc_s_param)
    logger.info("(scaling, remove_features, process) (%s, %s, %s)", scaling,
                remove_features, nb_process)
    logger.info(
        "(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
        missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    logger.info("(strategy_chaining, safety_chaining) (%s, %s)",
                strategy_chaining, safety_chaining)

    # Seed random values for the top-level k-fold learning-testing splits
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Open the CSV file (append mode) for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(
        class_model="classifip.models.mlc.chainncc.MLChaining")

    ich, cph, acc, acc_trans, avg_sols = dict(), dict(), dict(), dict(), dict()
    min_discretize, max_discretize = 5, 7
    for nb_disc in range(min_discretize, max_discretize):
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                try:
                    data_learning.remove_col(r_feature)
                except Exception as err:
                    print("Remove feature error: {0}".format(err))
        nb_labels = get_nb_labels_class(data_learning)
        if scaling:
            normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10x10 cross-validation
            logger.info(
                "Number of intervals for discretization and labels (%1d, %1d).",
                nb_disc, nb_labels)
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])

            splits_s = list([])
            for training, testing in cv_kfold:
                train_clone_data = training.make_clone()
                test_clone_data = testing.make_clone()
                MLCNCC.shuffle_labels_train_testing(train_clone_data,
                                                    test_clone_data,
                                                    nb_labels=nb_labels)
                logger.info("Splits %s train %s", len(training.data),
                            training.data[0])
                logger.info("Splits %s test %s", len(testing.data),
                            testing.data[0])
                splits_s.append((train_clone_data, test_clone_data))

            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            acc_trans[disc], acc[disc] = dict(), dict()
            avg_sols[disc] = dict()
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param,
                                   step_ncc_s_param):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = 0, 0
                avg_sols[disc][ks_ncc] = 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    res = computing_training_testing_step(
                        training, testing, nb_labels, s_ncc, manager,
                        strategy_chaining, safety_chaining, missing_pct,
                        noise_label_pct, noise_label_type, noise_label_prob,
                        ich[disc][ks_ncc], cph[disc][ks_ncc],
                        acc[disc][ks_ncc], acc_trans[disc][ks_ncc],
                        avg_sols[disc][ks_ncc])
                    ich[disc][ks_ncc], cph[disc][ks_ncc] = res[0], res[1]
                    acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = res[2], res[3]
                    avg_sols[disc][ks_ncc] = res[4]
                    logger.debug(
                        "Partial-step-cumulative (acc, ich, acc_trans, avg_sols) (%s, %s, %s, %s)",
                        acc[disc][ks_ncc], ich[disc][ks_ncc],
                        acc_trans[disc][ks_ncc], avg_sols[disc][ks_ncc])
                writer.writerow([
                    str(nb_disc), s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                    cph[disc][ks_ncc] / nb_kFold, acc[disc][ks_ncc] / nb_kFold,
                    acc_trans[disc][ks_ncc] / nb_kFold,
                    avg_sols[disc][ks_ncc] / nb_kFold
                ])
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s, %s)", disc,
                             s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold,
                             acc_trans[disc][ks_ncc] / nb_kFold)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s", ich, cph)
Example #25
 def __init__(self, DEBUG=False):
     super(MLCNCCExact, self).__init__(DEBUG)
     self.power_set = []
     self.root = None
     self.DEBUG = DEBUG
     self._logger = create_logger("MLCNCCExact", DEBUG)