# NOTE: get_nb_labels_class and normalize are assumed to be imported from
# the experiment's helper utilities in the classifip project.
from classifip.dataset import arff


def init_dataset(in_path, remove_features, scaling, nb_labels=None):
    """Load an ARFF dataset, optionally dropping features and scaling it."""
    data_learning = arff.ArffFile()
    data_learning.load(in_path)
    if remove_features is not None:
        for r_feature in remove_features:
            try:
                data_learning.remove_col(r_feature)
            except Exception as err:
                print("Remove feature error: {0}".format(err))
    if nb_labels is None:
        nb_labels = get_nb_labels_class(data_learning)
    if scaling:
        normalize(data_learning, n_labels=nb_labels)
    return data_learning, nb_labels
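# A minimal usage sketch (not part of the original code; the ARFF path and
# the dropped feature name below are hypothetical):
if __name__ == "__main__":
    data, nb_labels = init_dataset(in_path="yeast.arff",
                                   remove_features=["some_feature"],
                                   scaling=True)
    print("Loaded %d instances with %d labels" % (len(data.data), nb_labels))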
'''
Created on 25 August 2016

@author: Sebastien Destercke

About how to use the Naive Credal Classifier Binary Relevance.

'''

print("Example of multilabel prediction with NCC BR - data set yeast \n")

print("Data loading \n")
from classifip.dataset import arff

data = arff.ArffFile()
data.load("yeast.arff")
dataset = 'yeast'
nblab = 14

# We start by creating an instance of the base classifier we want to use
print("Model creation and learning \n")
from classifip.models import knnbr

model = knnbr.IPKNNBR()

# Learning
model.learn(data, nblab)

# Evaluation: we can set the parameters of the classifier
test = model.evaluate([row[0:len(row) - nblab] for row in data.data[0:10]],
                      knnbr_beta=0.5)

# The output is a list of probability intervals; we can print each instance:
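# The loop below is a sketch completing the comment above (not in the
# original script); it assumes evaluate() returns one interval-valued
# prediction per test instance.
for i, intervals in enumerate(test):
    print("Instance {0}: {1}".format(i, intervals))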
# NOTE: create_logger, normalize, get_nb_labels_class, ManagerWorkers,
# k_fold_cross_validation and computing_training_testing_step are assumed to
# be imported from classifip and the experiment's helper module.
import csv
import os
import random
import sys

import numpy as np

from classifip.dataset import arff


def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=True,
                                  max_ncc_s_param=5,
                                  remove_features=None):
    assert os.path.exists(in_path), "Training data not found; nothing to test"
    assert os.path.exists(out_path), "Output file for results does not exist"

    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)

    # Seed the random generator, then draw one seed per k-fold repetition
    if seed is not None:
        random.seed(seed)
    seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Open a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(class_model="classifip.models.mlcncc.MLCNCC")

    ich, cph = dict(), dict()
    min_discretize, max_discretize = 5, 9
    for nb_disc in range(min_discretize, max_discretize):
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                data_learning.remove_col(r_feature)
        nb_labels = get_nb_labels_class(data_learning)
        if scaling:
            normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10x10-fold cross-validation
            logger.info("Number of discretization intervals and labels (%1d, %1d)."
                        % (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])
            splits_s = list([])
            for training, testing in cv_kfold:
                splits_s.append((training, testing))
                logger.info("Splits %s train %s", len(training.data), training.data[0])
                logger.info("Splits %s test %s", len(testing.data), testing.data[0])

            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            for s_ncc in np.arange(0.1, max_ncc_s_param + 1, 1):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    ich[disc][ks_ncc], cph[disc][ks_ncc] = \
                        computing_training_testing_step(training, testing,
                                                        nb_labels, s_ncc, manager,
                                                        ich[disc][ks_ncc],
                                                        cph[disc][ks_ncc])
                writer.writerow([str(nb_disc), s_ncc, time,
                                 ich[disc][ks_ncc] / nb_kFold,
                                 cph[disc][ks_ncc] / nb_kFold])
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s)",
                             disc, s_ncc, time,
                             ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Final results: %s, %s", ich, cph)
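# A hypothetical invocation of the grid search above (file paths, seed and
# process count are illustrative, not taken from the original experiments):
if __name__ == "__main__":
    computing_best_imprecise_mean(in_path="yeast.arff",
                                  out_path="results_yeast.csv",
                                  seed=1234,
                                  nb_process=2)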
'''
Created on 20 May 2019

@author: Yonatan-Carlos Carranza-Alarcon

About how to use the imprecise multilabel chaining.

'''
from classifip.dataset import arff
from classifip.models.mlc.chainncc import MLChaining

dataArff = arff.ArffFile()
dataArff.load("emotions.arff")
dataArff.discretize(discmet='eqfreq', numint=5)
# NOTE: the emotions benchmark is commonly distributed with 6 labels;
# adjust nb_labels to match the ARFF file actually used.
nb_labels = 5

# Test instances (features only, labels stripped)
new_instances = [row[0:len(row) - nb_labels] for row in dataArff.data[10:11]]

# We start by creating a model
model = MLChaining()
model.learn(dataArff, nb_labels)
probabilities, chain = model.evaluate(new_instances,
                                      ncc_epsilon=0.001,
                                      ncc_s_param=1)
print("Probability intervals obtained for each label on the first test instance \n")
print(probabilities[0])
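# A follow-up sketch (not in the original example): turning the interval
# predictions into cautious label decisions. It assumes probabilities[0]
# yields a [lower, upper] interval of P(label = 1) for each label.
for j, (lower, upper) in enumerate(probabilities[0]):
    if lower > 0.5:
        decision = "relevant"
    elif upper < 0.5:
        decision = "irrelevant"
    else:
        decision = "abstain"  # the interval straddles 0.5, so we abstain
    print("Label {0}: [{1:.3f}, {2:.3f}] -> {3}".format(j, lower, upper, decision))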
# NOTE: create_logger, normalize, get_nb_labels_class, ManagerWorkers,
# k_fold_cross_validation, computing_training_testing_step, MLCNCC and
# IMLCStrategy are assumed to be imported from classifip and the
# experiment's helper module.
import csv
import os
import random
import sys

import numpy as np

from classifip.dataset import arff


def experiments_chaining_imprecise(in_path=None,
                                   out_path=None,
                                   seed=None,
                                   nb_kFold=10,
                                   nb_process=1,
                                   min_ncc_s_param=0.5,
                                   max_ncc_s_param=6.0,
                                   step_ncc_s_param=1.0,
                                   missing_pct=0.0,
                                   noise_label_pct=0.0,
                                   noise_label_type=-1,
                                   noise_label_prob=0.5,
                                   remove_features=None,
                                   scaling=False,
                                   strategy_chaining=IMLCStrategy.IMPRECISE_BRANCHING,
                                   safety_chaining=False):
    assert os.path.exists(in_path), "Training data not found; nothing to test"
    assert os.path.exists(out_path), "Output file for results does not exist"

    logger = create_logger("experiments_chaining_imprecise", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info("(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
                min_ncc_s_param, max_ncc_s_param, step_ncc_s_param)
    logger.info("(scaling, remove_features, process) (%s, %s, %s)",
                scaling, remove_features, nb_process)
    logger.info("(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) "
                "(%s, %s, %s, %s)",
                missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    logger.info("(strategy_chaining, safety_chaining) (%s, %s)",
                strategy_chaining, safety_chaining)

    # Seed the k-fold learning/testing splits (one seed per repetition)
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Open a CSV file for saving results
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)

    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(class_model="classifip.models.mlc.chainncc.MLChaining")

    ich, cph, acc, acc_trans, avg_sols = dict(), dict(), dict(), dict(), dict()
    min_discretize, max_discretize = 5, 7
    for nb_disc in range(min_discretize, max_discretize):
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                try:
                    data_learning.remove_col(r_feature)
                except Exception as err:
                    print("Remove feature error: {0}".format(err))
        nb_labels = get_nb_labels_class(data_learning)
        if scaling:
            normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10x10-fold cross-validation
            logger.info("Number of discretization intervals and labels (%1d, %1d)."
                        % (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])
            splits_s = list([])
            for training, testing in cv_kfold:
                train_clone_data = training.make_clone()
                test_clone_data = testing.make_clone()
                MLCNCC.shuffle_labels_train_testing(train_clone_data,
                                                    test_clone_data,
                                                    nb_labels=nb_labels)
                logger.info("Splits %s train %s", len(training.data), training.data[0])
                logger.info("Splits %s test %s", len(testing.data), testing.data[0])
                splits_s.append((train_clone_data, test_clone_data))

            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            acc_trans[disc], acc[disc] = dict(), dict()
            avg_sols[disc] = dict()
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = 0, 0
                avg_sols[disc][ks_ncc] = 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    res = computing_training_testing_step(
                        training, testing, nb_labels, s_ncc, manager,
                        strategy_chaining, safety_chaining,
                        missing_pct, noise_label_pct,
                        noise_label_type, noise_label_prob,
                        ich[disc][ks_ncc], cph[disc][ks_ncc],
                        acc[disc][ks_ncc], acc_trans[disc][ks_ncc],
                        avg_sols[disc][ks_ncc])
                    ich[disc][ks_ncc], cph[disc][ks_ncc] = res[0], res[1]
                    acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = res[2], res[3]
                    avg_sols[disc][ks_ncc] = res[4]
                    logger.debug("Partial-step-cumulative (acc, ich, acc_trans, avg_sols) "
                                 "(%s, %s, %s, %s)",
                                 acc[disc][ks_ncc], ich[disc][ks_ncc],
                                 acc_trans[disc][ks_ncc], avg_sols[disc][ks_ncc])
                writer.writerow([str(nb_disc), s_ncc, time,
                                 ich[disc][ks_ncc] / nb_kFold,
                                 cph[disc][ks_ncc] / nb_kFold,
                                 acc[disc][ks_ncc] / nb_kFold,
                                 acc_trans[disc][ks_ncc] / nb_kFold,
                                 avg_sols[disc][ks_ncc] / nb_kFold])
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s, %s)",
                             disc, s_ncc, time,
                             ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold,
                             acc_trans[disc][ks_ncc] / nb_kFold)
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Final results: %s, %s", ich, cph)
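# A hypothetical invocation of the chaining experiment (all argument values
# below are illustrative, not taken from the original experiments):
if __name__ == "__main__":
    experiments_chaining_imprecise(in_path="emotions.arff",
                                   out_path="results_chaining.csv",
                                   nb_process=2,
                                   scaling=True,
                                   strategy_chaining=IMLCStrategy.IMPRECISE_BRANCHING,
                                   safety_chaining=False)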