import copy
import numpy as np

# The remaining helpers (init_dataset, cv10fold_br_vs_ibr, k_fold_cross_validation,
# ArffFile, generate_seeds, generate_name, saving_data_sets, ...) are assumed to be
# provided by the surrounding experiment code.


def re_sampling_with_pct_train(logger, manager, metrics, nb_resampling, writer, file_csv,
                               min_ell_param, max_ell_param, step_ell_param,
                               missing_pct, noise_label_pct, noise_label_type,
                               noise_label_prob, save_query):
    for resampling in range(nb_resampling):
        for pct_training in np.arange(10, 100, 10):
            logger.info("Percentage/Resampling of training set: %s, %s.",
                        pct_training, resampling)
            # loading the data set (in_path is a module-level printf-style template)
            in_path_train = in_path % ("train", resampling + 1, int(pct_training))
            in_path_test = in_path % ("test", resampling + 1, int(pct_training))
            logger.info("Evaluate training/test data set: (%s, %s).",
                        in_path_train, in_path_test)
            data_training, nb_labels = init_dataset(in_path_train, None, False)
            data_test, _ = init_dataset(in_path_test, None, False)
            # put the training and test data sets into a single split
            splits_s = [(data_training, data_test)]
            re_pct = str(resampling) + "-" + str(pct_training)
            metrics.init_sub_level(re_pct)
            for ell_imprecision in np.arange(min_ell_param, max_ell_param, step_ell_param):
                metrics.init_level_imprecision(str(ell_imprecision), re_pct)
                save_prediction = save_query(pct_training, ell_imprecision, resampling)
                # train and evaluate this resampled partition
                cv10fold_br_vs_ibr(logger, splits_s, ell_imprecision, nb_labels, manager,
                                   metrics, re_pct, missing_pct, noise_label_pct,
                                   noise_label_type, noise_label_prob, save_prediction)
                _partial_saving = metrics.generate_row_line(str(ell_imprecision), resampling, 1,
                                                            sub_level=re_pct)
                _partial_saving.insert(0, str(pct_training))
                writer.writerow(_partial_saving)
                file_csv.flush()
                logger.debug("Partial-ncc_step (re_pct, s, time, ich_skep, cph_skep, ich_out, "
                             "cph_out, acc, jacc, ich_reject, cph_reject, jacc_reject) "
                             "(%s, %s, %s, %s)",
                             re_pct, ell_imprecision, resampling, metrics)
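# Note (a sketch, not the source's actual naming): this driver, like the variants
# further below, fills the printf-style template `in_path` with the split name
# ("train"/"test"), the 1-based replicate index, and the training percentage.
# A hypothetical template and the file it would resolve to:
#
#     in_path = "datasets/resampling/emotions_%s_%s_pct_%s.arff"
#     in_path % ("train", 1, 70)  # -> "datasets/resampling/emotions_train_1_pct_70.arff"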
def cv10x10fold_br_vs_ibr(logger, manager, metrics, remove_features, scaling, nb_kFold, seed,
                          writer, file_csv, min_ell_param, max_ell_param, step_ell_param,
                          missing_pct, noise_label_pct, noise_label_type, noise_label_prob):
    data_learning, nb_labels = init_dataset(in_path, remove_features, scaling)
    for time in range(nb_kFold):  # 10x10-fold cross-validation
        logger.info("Number labels %s", nb_labels)
        cv_kfold = k_fold_cross_validation(data_learning, K=nb_kFold, randomise=True,
                                           random_seed=seed[time])
        splits_s = []
        # clone each fold because the generator yields the same objects in memory
        for training, testing in cv_kfold:
            splits_s.append((training.make_clone(), testing.make_clone()))
            logger.info("Splits %s train %s", len(training.data), training.data[0][1:4])
            logger.info("Splits %s test %s", len(testing.data), testing.data[0][1:4])
        for ell_imprecision in np.arange(min_ell_param, max_ell_param, step_ell_param):
            str_ell = str(ell_imprecision)
            metrics.init_level_imprecision(str_ell)
            cv10fold_br_vs_ibr(logger, splits_s, ell_imprecision, nb_labels, manager, metrics,
                               None, missing_pct, noise_label_pct, noise_label_type,
                               noise_label_prob)
            _partial_saving = metrics.generate_row_line(str_ell, time, nb_kFold)
            writer.writerow(_partial_saving)
            file_csv.flush()
            logger.debug("Partial-s-k_step (ell, time, ich_skep, cph_skep, acc, "
                         "ich_reject, cph_reject) (%s, %s, %s)",
                         ell_imprecision, time, metrics)
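# Usage note (a minimal sketch; the parameter bounds and noise settings are
# hypothetical placeholders, not values taken from the source): `seed` must hold
# one random seed per repetition, e.g. produced with the same generate_seeds
# helper used by create_dataset_by_percentage below, and `in_path` is again read
# from module level.
#
#     seeds = generate_seeds(10)
#     cv10x10fold_br_vs_ibr(logger, manager, metrics, remove_features=None, scaling=False,
#                           nb_kFold=10, seed=seeds, writer=writer, file_csv=file_csv,
#                           min_ell_param=0.1, max_ell_param=3.1, step_ell_param=0.1,
#                           missing_pct=0.0, noise_label_pct=0.0, noise_label_type=None,
#                           noise_label_prob=0.0)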
def re_sampling_with_pct_train(logger, in_path, nb_resampling, file_csv, writer, metrics,
                               manager, missing_pct, noise_label_pct, noise_label_type,
                               noise_label_prob):
    for pct_training in np.arange(10, 100, 10):
        logger.info("Percentage of training set: %s.", pct_training)
        re_pct = str(pct_training)
        metrics.init_sub_level(re_pct)
        for resampling in range(nb_resampling):
            # loading the data set
            in_path_train = in_path % ("train", resampling + 1, int(pct_training))
            in_path_test = in_path % ("test", resampling + 1, int(pct_training))
            logger.info("Evaluate training/test data set: (%s, %s).",
                        in_path_train, in_path_test)
            data_training, nb_labels = init_dataset(in_path_train, None, False)
            data_test, _ = init_dataset(in_path_test, None, False)
            # put the training and test data sets into a single split
            splits_s = [(data_training, data_test)]
            # gamma hyper-parameter level (here indexed by the resampling replicate)
            metrics.init_level_imprecision(str(resampling), sub_level=re_pct)
            cv10fold_br_vs_ibr(logger, splits_s, resampling, nb_labels, manager, metrics,
                               re_pct, missing_pct, noise_label_pct, noise_label_type,
                               noise_label_prob)
            _partial_saving = metrics.generate_row_line(str(resampling), resampling, 1,
                                                        sub_level=re_pct)
            del _partial_saving[1]  # drop the resampling-replicate column
            _partial_saving.insert(0, re_pct)
            writer.writerow(_partial_saving)
            file_csv.flush()
            logger.debug("Partial-s-k_step (time, hamming, ich_skep) (%s, %s, %s)",
                         resampling, metrics.score_hamming, metrics.ich_iid_skeptic)
def cv10x10fold_br_vs_ibr(in_path, remove_features, scaling, logger, nb_kFold, discretization,
                          do_inference_exact, manager, metrics, writer, seed, file_csv,
                          missing_pct, noise_label_pct, noise_label_type, noise_label_prob,
                          min_ncc_s_param, max_ncc_s_param, step_ncc_s_param):
    data_learning, nb_labels = init_dataset(in_path, remove_features, scaling)
    data_learning.discretize(discmet="eqfreq", numint=discretization)
    for time in range(nb_kFold):  # 10x10-fold cross-validation
        logger.info("Number of discretization intervals and labels (%1d, %1d)."
                    % (discretization, nb_labels))
        cv_kfold = k_fold_cross_validation(data_learning, nb_kFold, randomise=True,
                                           random_seed=seed[time])
        splits_s = []
        # clone each fold because the generator yields the same objects in memory
        for training, testing in cv_kfold:
            splits_s.append((training.make_clone(), testing.make_clone()))
            logger.info("Splits %s train %s", len(training.data), training.data[0][1:4])
            logger.info("Splits %s test %s", len(testing.data), testing.data[0][1:4])
        disc_time = str(discretization) + "-" + str(time)
        metrics.init_sub_level(disc_time)
        for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param):
            str_s_ncc = str(s_ncc)
            metrics.init_level_imprecision(str_s_ncc, disc_time)
            cv10fold_br_vs_ibr(splits_s, s_ncc, disc_time, manager, metrics, logger, nb_labels,
                               do_inference_exact, missing_pct, noise_label_pct,
                               noise_label_type, noise_label_prob)
            _partial_saving = metrics.generate_row_line(str_s_ncc, time, nb_kFold,
                                                        sub_level=disc_time)
            _partial_saving.insert(0, str(discretization))
            writer.writerow(_partial_saving)
            file_csv.flush()
            logger.debug("Partial-ncc_step (disc, s, time, ich_skep, cph_skep, ich_out, "
                         "cph_out, acc, jacc, ich_reject, cph_reject, jacc_reject) "
                         "(%s, %s, %s, %s)",
                         disc_time, s_ncc, time, metrics)
def create_dataset_by_percentage(in_path, out_path, pct_test, nb_samplings,
                                 dataset="emotions", scaling=True, nb_labels=None):
    seeds = generate_seeds(nb_samplings)
    data_learning, _ = init_dataset(in_path, remove_features=[], scaling=scaling,
                                    nb_labels=nb_labels)
    print("[SEEDs-GENERATION-PCT-TEST] ", pct_test, seeds, flush=True)
    for i in range(nb_samplings):
        str_pct_training = str(int(round((1 - pct_test) * 100)))
        file_test_name, file_train_name = generate_name(dataset, str(i + 1), str_pct_training)
        saving_data_sets(data_learning, out_path, pct_test, file_train_name, file_test_name,
                         seeds[i], i)
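# Example (a sketch with hypothetical paths and sampling count): the re-sampling
# drivers in this section sweep the training percentage from 10% to 90%, so the
# resampled files would typically be generated once per complementary test
# percentage, e.g.:
#
#     for pct_test in np.arange(0.1, 1.0, 0.1):
#         create_dataset_by_percentage("datasets/emotions.arff", "datasets/resampling/",
#                                      round(float(pct_test), 1), nb_samplings=50,
#                                      dataset="emotions", scaling=True)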
def re_sampling_with_pct_train(in_path, logger, nb_resampling, discretization, do_inference_exact,
                               manager, metrics, writer, file_csv, missing_pct, noise_label_pct,
                               noise_label_type, noise_label_prob, min_ncc_s_param,
                               max_ncc_s_param, step_ncc_s_param):
    logger.info("Number of discretization intervals (%1d)." % discretization)
    for pct_training in np.arange(10, 100, 10):
        logger.info("Percentage of training set: %s.", pct_training)
        for resampling in range(nb_resampling):
            # loading the data set
            in_path_train = in_path % ("train", resampling + 1, int(pct_training))
            in_path_test = in_path % ("test", resampling + 1, int(pct_training))
            logger.info("Evaluate training/test data set: (%s, %s).",
                        in_path_train, in_path_test)
            data_training, nb_labels = init_dataset(in_path_train, None, False)
            data_test, _ = init_dataset(in_path_test, None, False)
            # discretize training and test jointly: tag rows, merge, discretize, then split back
            data_training.data = [row + ["T"] for row in data_training.data]  # T: training
            data_test.data = [row + ["V"] for row in data_test.data]  # V: validation (test set)
            data_global = ArffFile()
            data_global.attribute_data = data_training.attribute_data.copy()
            data_global.attribute_types = data_training.attribute_types.copy()
            data_global.data = copy.deepcopy(data_training.data + data_test.data)
            data_global.attributes = copy.copy(data_training.attributes)
            data_global.discretize(discmet="eqfreq", numint=discretization)
            # copy back the discretized training data set
            data_training.attribute_data = data_global.attribute_data.copy()
            data_training.attribute_types = data_global.attribute_types.copy()
            data_training.attributes = copy.copy(data_global.attributes)
            data_training.data = [row[:-1] for row in data_global.data if row[-1] == "T"]
            # copy back the discretized test data set
            data_test.attribute_data = data_global.attribute_data.copy()
            data_test.attribute_types = data_global.attribute_types.copy()
            data_test.attributes = copy.copy(data_global.attributes)
            data_test.data = [row[:-1] for row in data_global.data if row[-1] == "V"]
            # put the training and test data sets into a single split
            splits_s = [(data_training, data_test)]
            disc_re_pct = str(discretization) + "-" + str(resampling) + "-" + str(pct_training)
            metrics.init_sub_level(disc_re_pct)
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param):
                metrics.init_level_imprecision(str(s_ncc), disc_re_pct)
                # train and evaluate this resampled partition
                cv10fold_br_vs_ibr(splits_s, s_ncc, disc_re_pct, manager, metrics, logger,
                                   nb_labels, do_inference_exact, missing_pct, noise_label_pct,
                                   noise_label_type, noise_label_prob)
                _partial_saving = metrics.generate_row_line(str(s_ncc), resampling, 1,
                                                            sub_level=disc_re_pct)
                _partial_saving.insert(0, str(discretization))
                _partial_saving.insert(0, str(pct_training))
                writer.writerow(_partial_saving)
                file_csv.flush()
                logger.debug("Partial-ncc_step (disc, s, time, ich_skep, cph_skep, ich_out, "
                             "cph_out, acc, jacc, ich_reject, cph_reject, jacc_reject) "
                             "(%s, %s, %s, %s)",
                             disc_re_pct, s_ncc, resampling, metrics)