Example 1
def re_sampling_with_pct_train(logger, manager, metrics, nb_resampling, writer, file_csv,
                               min_ell_param, max_ell_param, step_ell_param,
                               missing_pct, noise_label_pct, noise_label_type,
                               noise_label_prob, save_query):

    for resampling in range(nb_resampling):
        for pct_training in np.arange(10, 100, 10):
            logger.info("Percentage/Resampling of training set: %s, %s.", pct_training, resampling)
            # Loading data set
            in_path_train = in_path % ("train", resampling + 1, int(pct_training))
            in_path_test = in_path % ("test", resampling + 1, int(pct_training))
            logger.info("Evaluate training/test data set: (%s, %s).", in_path_train, in_path_test)
            data_training, nb_labels = init_dataset(in_path_train, None, False)
            data_test, _ = init_dataset(in_path_test, None, False)

            # wrap the training and test sets into a single split
            splits_s = [(data_training, data_test)]
            re_pct = str(resampling) + "-" + str(pct_training)
            metrics.init_sub_level(re_pct)
            for ell_imprecision in np.arange(min_ell_param, max_ell_param, step_ell_param):
                metrics.init_level_imprecision(str(ell_imprecision), re_pct)
                save_prediction = save_query(pct_training, ell_imprecision, resampling)
                # train and evaluate on this resampled partition
                cv10fold_br_vs_ibr(logger, splits_s, ell_imprecision, nb_labels, manager, metrics, re_pct,
                                   missing_pct, noise_label_pct, noise_label_type, noise_label_prob,
                                   save_prediction)

                _partial_saving = metrics.generate_row_line(str(ell_imprecision),
                                                            resampling, 1, sub_level=re_pct)
                _partial_saving.insert(0, str(pct_training))
                writer.writerow(_partial_saving)
                file_csv.flush()
                logger.debug("Partial-ncc_step (re_pct, s, time, ich_skep, cph_skep, ich_out, "
                             "cph_out, acc, jacc, ich_reject, cph_reject, jacc_reject) (%s, %s, %s, %s)",
                             re_pct, ell_imprecision, resampling, metrics)
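
The in_path referenced above is read from module scope as a printf-style template.
A minimal sketch of how the train/test paths expand, using a hypothetical file
pattern (the real template is defined elsewhere in the module):

import numpy as np

in_path = "datasets/emotions_%s_rep-%s_pct-%s.arff"  # hypothetical pattern

for resampling in range(2):
    for pct_training in np.arange(10, 100, 10):
        print(in_path % ("train", resampling + 1, int(pct_training)))
        print(in_path % ("test", resampling + 1, int(pct_training)))
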
Example 2
def cv10x10fold_br_vs_ibr(logger, manager, metrics, remove_features, scaling, nb_kFold, seed,
                          writer, file_csv, min_ell_param, max_ell_param, step_ell_param,
                          missing_pct, noise_label_pct, noise_label_type, noise_label_prob):
    data_learning, nb_labels = init_dataset(in_path, remove_features, scaling)
    for time in range(nb_kFold):  # outer repetitions of the 10x10-fold cross-validation
        logger.info("Number labels %s", nb_labels)
        cv_kfold = k_fold_cross_validation(data_learning,
                                           K=nb_kFold,
                                           randomise=True,
                                           random_seed=seed[time])

        splits_s = []
        for training, testing in cv_kfold:
            # clone each fold: the generator yields references to shared objects
            splits_s.append((training.make_clone(), testing.make_clone()))
            logger.info("Splits %s train %s", len(training.data), training.data[0][1:4])
            logger.info("Splits %s test %s", len(testing.data), testing.data[0][1:4])

        for ell_imprecision in np.arange(min_ell_param, max_ell_param, step_ell_param):
            str_ell = str(ell_imprecision)
            metrics.init_level_imprecision(str_ell)
            cv10fold_br_vs_ibr(logger, splits_s, ell_imprecision, nb_labels, manager, metrics, None,
                               missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
            _partial_saving = metrics.generate_row_line(str_ell, time, nb_kFold)
            writer.writerow(_partial_saving)
            file_csv.flush()
            logger.debug("Partial-s-k_step (ell, time, ich_skep, cph_skep, acc, "
                         "ich_reject, cph_reject) (%s, %s, %s)",
                         ell_imprecision, time, metrics)
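
make_clone is a project-specific method; the sketch below, using plain lists
and copy.deepcopy as a stand-in, illustrates the aliasing problem the clones
avoid: without copies, every stored split would point at the same mutable
object, so later in-place edits (discretization, noise injection) would leak
across folds.

import copy

fold = [[1, 2], [3, 4]]
splits = [fold, fold]                # two references to one object
splits[0][0][0] = 99
print(splits[1][0][0])               # 99 -- the "other" split changed too

splits = [copy.deepcopy(fold), copy.deepcopy(fold)]
splits[0][0][0] = 0
print(splits[1][0][0])               # still 99 -- independent copies
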
Example 3
def re_sampling_with_pct_train(logger, in_path, nb_resampling, file_csv,
                               writer, metrics, manager, missing_pct,
                               noise_label_pct, noise_label_type,
                               noise_label_prob):
    for pct_training in np.arange(10, 100, 10):
        logger.info("Percentage of training set: %s.", pct_training)
        re_pct = str(pct_training)
        metrics.init_sub_level(re_pct)
        for resampling in range(nb_resampling):
            # Loading data set
            in_path_train = in_path % ("train", resampling + 1,
                                       int(pct_training))
            in_path_test = in_path % ("test", resampling + 1,
                                      int(pct_training))
            logger.info("Evaluate training/test data set: (%s, %s).",
                        in_path_train, in_path_test)
            data_training, nb_labels = init_dataset(in_path_train, None, False)
            data_test, _ = init_dataset(in_path_test, None, False)

            # wrap the training and test sets into a single split
            splits_s = [(data_training, data_test)]
            # imprecision-level slot (gamma hyper-parameter), keyed here by the resampling index
            metrics.init_level_imprecision(str(resampling), sub_level=re_pct)
            cv10fold_br_vs_ibr(logger, splits_s, resampling, nb_labels,
                               manager, metrics, re_pct, missing_pct,
                               noise_label_pct, noise_label_type,
                               noise_label_prob)

            _partial_saving = metrics.generate_row_line(str(resampling),
                                                        resampling,
                                                        1,
                                                        sub_level=re_pct)
            del _partial_saving[1]  # remove resampling replicate
            _partial_saving.insert(0, re_pct)
            writer.writerow(_partial_saving)
            file_csv.flush()
            logger.debug(
                "Partial-s-k_step (time, hamming, ich_skep) (%s, %s, %s)",
                resampling, metrics.score_hamming, metrics.ich_iid_skeptic)
Example 4
def cv10x10fold_br_vs_ibr(in_path, remove_features, scaling, logger, nb_kFold,
                          discretization, do_inference_exact, manager, metrics,
                          writer, seed, file_csv, missing_pct, noise_label_pct,
                          noise_label_type, noise_label_prob, min_ncc_s_param,
                          max_ncc_s_param, step_ncc_s_param):
    data_learning, nb_labels = init_dataset(in_path, remove_features, scaling)
    data_learning.discretize(discmet="eqfreq", numint=discretization)

    for time in range(nb_kFold):  # outer repetitions of the 10x10-fold cross-validation
        logger.info("Number interval for discreteness and labels (%1d, %1d)." %
                    (discretization, nb_labels))
        cv_kfold = k_fold_cross_validation(data_learning,
                                           nb_kFold,
                                           randomise=True,
                                           random_seed=seed[time])

        splits_s = []
        for training, testing in cv_kfold:
            # clone each fold: the generator yields references to shared objects
            splits_s.append((training.make_clone(), testing.make_clone()))
            logger.info("Splits %s train %s", len(training.data),
                        training.data[0][1:4])
            logger.info("Splits %s test %s", len(testing.data),
                        testing.data[0][1:4])

        disc_time = str(discretization) + "-" + str(time)
        metrics.init_sub_level(disc_time)
        for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param,
                               step_ncc_s_param):
            str_s_ncc = str(s_ncc)
            metrics.init_level_imprecision(str_s_ncc, disc_time)
            cv10fold_br_vs_ibr(splits_s, s_ncc, disc_time, manager, metrics,
                               logger, nb_labels, do_inference_exact,
                               missing_pct, noise_label_pct, noise_label_type,
                               noise_label_prob)

            _partial_saving = metrics.generate_row_line(str_s_ncc,
                                                        time,
                                                        nb_kFold,
                                                        sub_level=disc_time)
            _partial_saving.insert(0, str(discretization))
            writer.writerow(_partial_saving)
            file_csv.flush()
            logger.debug(
                "Partial-ncc_step (disc, s, time, ich_skep, cph_skep, ich_out, "
                "cph_out, acc, jacc, ich_reject, cph_reject, jacc_reject) (%s, %s, %s, %s)",
                disc_time, s_ncc, time, metrics)
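
Note that np.arange excludes its stop value, so max_ncc_s_param itself is
never evaluated; if the upper bound is meant to be part of the sweep, the
caller has to pass max_ncc_s_param + step_ncc_s_param. For example:

import numpy as np

print(np.arange(0.5, 3.0, 0.5))   # [0.5 1.  1.5 2.  2.5] -- 3.0 excluded
print(np.arange(0.5, 3.5, 0.5))   # [0.5 1.  1.5 2.  2.5 3. ]
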
Example 5
def create_dataset_by_percentage(in_path,
                                 out_path,
                                 pct_test,
                                 nb_samplings,
                                 dataset="emotions",
                                 scaling=True,
                                 nb_labels=None):
    seeds = generate_seeds(nb_samplings)
    data_learning, _ = init_dataset(in_path,
                                    remove_features=[],
                                    scaling=scaling,
                                    nb_labels=nb_labels)
    print("[SEEDs-GENERATION-PCT-TEST] ", pct_test, seeds, flush=True)
    str_pct_training = str(int(round((1 - pct_test) * 100)))
    for i in range(nb_samplings):
        file_test_name, file_train_name = generate_name(
            dataset, str(i + 1), str_pct_training)
        saving_data_sets(data_learning, out_path, pct_test, file_train_name,
                         file_test_name, seeds[i], i)
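
A hypothetical call, for illustration only (the paths, test fraction, and
label count below are assumptions, not values taken from the project):

create_dataset_by_percentage(in_path="datasets/emotions.arff",   # hypothetical path
                             out_path="datasets/resampled",      # hypothetical path
                             pct_test=0.35,                      # 65% training
                             nb_samplings=50,
                             dataset="emotions",
                             scaling=True,
                             nb_labels=6)
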
Example 6
def re_sampling_with_pct_train(in_path, logger, nb_resampling, discretization,
                               do_inference_exact, manager, metrics, writer,
                               file_csv, missing_pct, noise_label_pct,
                               noise_label_type, noise_label_prob,
                               min_ncc_s_param, max_ncc_s_param,
                               step_ncc_s_param):
    logger.info("Number interval for discreteness (%1d)." % discretization)
    for pct_training in np.arange(10, 100, 10):
        logger.info("Percentage of training set: %s.", pct_training)

        for resampling in range(nb_resampling):
            # Loading data set
            in_path_train = in_path % ("train", resampling + 1,
                                       int(pct_training))
            in_path_test = in_path % ("test", resampling + 1,
                                      int(pct_training))
            logger.info("Evaluate training/test data set: (%s, %s).",
                        in_path_train, in_path_test)
            data_training, nb_labels = init_dataset(in_path_train, None, False)
            data_test, _ = init_dataset(in_path_test, None, False)
            # discretize train and test jointly so both share the same bins
            # (copy and the project's ArffFile class are assumed imported at module level)
            data_training.data = [row + ["T"]
                                  for row in data_training.data]  # T: Training
            data_test.data = [row + ["V"]
                              for row in data_test.data]  # V: Validation
            data_global = ArffFile()
            data_global.attribute_data = data_training.attribute_data.copy()
            data_global.attribute_types = data_training.attribute_types.copy()
            data_global.data = copy.deepcopy(data_training.data +
                                             data_test.data)
            data_global.attributes = copy.copy(data_training.attributes)
            data_global.discretize(discmet="eqfreq", numint=discretization)
            # copy the discretized training portion back
            data_training.attribute_data = data_global.attribute_data.copy()
            data_training.attribute_types = data_global.attribute_types.copy()
            data_training.attributes = copy.copy(data_global.attributes)
            data_training.data = [
                row[:-1] for row in data_global.data if row[-1] == "T"
            ]
            # copy the discretized test portion back
            data_test.attribute_data = data_global.attribute_data.copy()
            data_test.attribute_types = data_global.attribute_types.copy()
            data_test.attributes = copy.copy(data_global.attributes)
            data_test.data = [
                row[:-1] for row in data_global.data if row[-1] == "V"
            ]

            # wrap the training and test sets into a single split
            splits_s = [(data_training, data_test)]
            disc_re_pct = str(discretization) + "-" + str(
                resampling) + "-" + str(pct_training)
            metrics.init_sub_level(disc_re_pct)
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param,
                                   step_ncc_s_param):
                metrics.init_level_imprecision(str(s_ncc), disc_re_pct)
                # train and evaluate on this resampled partition
                cv10fold_br_vs_ibr(splits_s, s_ncc, disc_re_pct, manager,
                                   metrics, logger, nb_labels,
                                   do_inference_exact, missing_pct,
                                   noise_label_pct, noise_label_type,
                                   noise_label_prob)

                _partial_saving = metrics.generate_row_line(
                    str(s_ncc), resampling, 1, sub_level=disc_re_pct)
                _partial_saving.insert(0, str(discretization))
                _partial_saving.insert(0, str(pct_training))
                writer.writerow(_partial_saving)
                file_csv.flush()
                logger.debug(
                    "Partial-ncc_step (disc, s, time, ich_skep, cph_skep, ich_out, "
                    "cph_out, acc, jacc, ich_reject, cph_reject, jacc_reject) (%s, %s, %s, %s)",
                    disc_re_pct, s_ncc, resampling, metrics)
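
The tag-pool-split pattern above (append a "T"/"V" marker, pool the rows,
transform the pool once, route rows back by marker) generalizes beyond
ArffFile. A self-contained sketch on plain lists, with a two-bin equal-width
cut standing in for data_global.discretize():

train = [[0.10], [0.40], [0.90]]
test = [[0.25], [0.70]]

pooled = [row + ["T"] for row in train] + [row + ["V"] for row in test]

# Stand-in for discretize(): equal-width cut on column 0 over the pooled data,
# so train and test are binned against the same boundaries.
values = [row[0] for row in pooled]
cut = (min(values) + max(values)) / 2.0
pooled = [["low" if row[0] <= cut else "high"] + row[1:] for row in pooled]

train_disc = [row[:-1] for row in pooled if row[-1] == "T"]
test_disc = [row[:-1] for row in pooled if row[-1] == "V"]
print(train_disc)   # [['low'], ['low'], ['high']]
print(test_disc)    # [['low'], ['high']]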