Example #1
def init_dataset(in_path, remove_features, scaling, nb_labels=None):
    """Load an ARFF dataset and prepare it for a multilabel experiment.

    Parameters
    ----------
    in_path : str
        Path of the ARFF file to load.
    remove_features : list or None
        Column names to drop from the dataset; failures to remove a
        column are reported but not fatal (best-effort removal).
    scaling : bool
        When true, normalize the feature columns in place.
    nb_labels : int or None
        Number of label columns; inferred from the data when None.

    Returns
    -------
    tuple
        The prepared dataset and the (possibly inferred) label count.
    """
    dataset = arff.ArffFile()
    dataset.load(in_path)
    # Best-effort column removal: a missing column is logged, not raised.
    for feature in (remove_features or []):
        try:
            dataset.remove_col(feature)
        except Exception as err:
            print("Remove feature error: {0}".format(err))
    if nb_labels is None:
        nb_labels = get_nb_labels_class(dataset)
    if scaling:
        normalize(dataset, n_labels=nb_labels)
    return dataset, nb_labels
Example #2
'''
Created on 25 august 2016

@author: Sebastien Destercke

About how to use the Naive Credal Classifier Binary Relevance.

'''

print("Example of multilabel prediciton with NCC BR - data set yeast \n")

print("Data loading \n")
from classifip.dataset import arff

# Load the yeast benchmark; its 14 labels occupy the trailing columns.
data = arff.ArffFile()
data.load("yeast.arff")
dataset = 'yeast'
nblab = 14

# We start by creating an instance of the base classifier we want to use
print("Model creation and learning \n")
from classifip.models import knnbr

model = knnbr.IPKNNBR()

# Learning step: fit the model on the full dataset.
model.learn(data, nblab)

# Evaluation: classifier parameters can be supplied as keyword arguments.
# The label columns are stripped so only features reach the classifier.
test = model.evaluate([row[:-nblab] for row in data.data[:10]],
                      knnbr_beta=0.5)

# The output is a list of probability intervals, we can print each instance :
Example #3
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  seed=None,
                                  nb_kFold=10,
                                  nb_process=1,
                                  scaling=True,
                                  max_ncc_s_param=5,
                                  remove_features=None):
    """Grid-search the NCC s-parameter with repeated k-fold cross-validation.

    For each discretization level (5..8 equal-frequency bins) and each
    s value in [0.1, 1.1, ..., max_ncc_s_param + 0.1), runs nb_kFold
    repetitions of nb_kFold-fold cross-validation on the ARFF dataset at
    in_path, averaging two per-fold metrics (`ich`, `cph` — presumably
    incorrectness/completeness scores; TODO confirm against
    computing_training_testing_step) and appending one CSV row per
    (discretization, s, repetition) to out_path.

    :param in_path: path to an existing ARFF training dataset.
    :param out_path: path to an existing CSV file; results are appended.
    :param seed: optional value used to seed `random` so the per-repetition
        seed list below is reproducible.
    :param nb_kFold: number of folds AND number of repetitions.
    :param nb_process: worker processes for the ManagerWorkers pool.
    :param scaling: normalize features before discretization when True.
    :param max_ncc_s_param: upper bound of the s grid (exclusive, +1 offset).
    :param remove_features: optional list of column names to drop.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)

    # Seeding a random value for k-fold top learning-testing data.
    # The caller-supplied seed only seeds the RNG; the actual per-repetition
    # seeds are always regenerated (reproducible when `seed` is given).
    if seed is not None: random.seed(seed)
    seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results (append mode: earlier runs kept)
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    # Worker pool evaluating the MLCNCC model asynchronously.
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(class_model="classifip.models.mlcncc.MLCNCC")

    ich, cph = dict(), dict()
    min_discretize, max_discretize = 5, 9
    for nb_disc in range(min_discretize, max_discretize):
        # Reload the raw data for every discretization level so each level
        # discretizes from the original (non-discretized) values.
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                data_learning.remove_col(r_feature)
        nb_labels = get_nb_labels_class(data_learning)
        if scaling: normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10-10 times cross-validation
            logger.info(
                "Number interval for discreteness and labels (%1d, %1d)." %
                (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])

            # Materialize the folds once so every s value reuses the exact
            # same train/test splits.
            splits_s = list([])
            for training, testing in cv_kfold:
                splits_s.append((training, testing))
                logger.info("Splits %s train %s", len(training.data),
                            training.data[0])
                logger.info("Splits %s test %s", len(testing.data),
                            testing.data[0])

            # Keyed accumulator per (discretization, repetition) pair.
            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            for s_ncc in np.arange(0.1, max_ncc_s_param + 1, 1):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    # Metrics accumulate across folds; division by nb_kFold
                    # below turns the sums into per-fold averages.
                    ich[disc][ks_ncc], cph[disc][
                        ks_ncc] = computing_training_testing_step(
                            training, testing, nb_labels, s_ncc, manager,
                            ich[disc][ks_ncc], cph[disc][ks_ncc])

                writer.writerow([
                    str(nb_disc), s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                    cph[disc][ks_ncc] / nb_kFold
                ])
                # Flush so partial results survive a crash of a long run.
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s)", disc,
                             s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold)
    # Shut down the worker pool before closing the results file.
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s", ich, cph)
Example #4
'''
Created on 20 may 2019

@author: Yonatan-Carlos Carranza-Alarcon

About how to use the Imprecise multilabel chaining
'''
from classifip.dataset import arff
from classifip.models.mlc.chainncc import MLChaining
import timeit

# Load the emotions benchmark and discretize numeric attributes into
# five equal-frequency bins; its 5 labels occupy the trailing columns.
dataArff = arff.ArffFile()
dataArff.load("emotions.arff")
dataArff.discretize(discmet='eqfreq', numint=5)
nb_labels = 5

# Test instances: one row (index 10) with the label columns stripped.
new_instances = [row[:-nb_labels] for row in dataArff.data[10:11]]

# We start by creating a model
model = MLChaining()
model.learn(dataArff, nb_labels)

# Predict with the chained NCC; epsilon and s control the imprecision.
probabilities, chain = model.evaluate(new_instances,
                                      ncc_epsilon=0.001,
                                      ncc_s_param=1)

print(
    "Probability intervals obtained for each label on the first test instance \n"
)
print(probabilities[0])
Example #5
def experiments_chaining_imprecise(
        in_path=None,
        out_path=None,
        seed=None,
        nb_kFold=10,
        nb_process=1,
        min_ncc_s_param=0.5,
        max_ncc_s_param=6.0,
        step_ncc_s_param=1.0,
        missing_pct=0.0,
        noise_label_pct=0.0,
        noise_label_type=-1,
        noise_label_prob=0.5,
        remove_features=None,
        scaling=False,
        strategy_chaining=IMLCStrategy.IMPRECISE_BRANCHING,
        safety_chaining=False):
    """Run repeated k-fold CV experiments for imprecise multilabel chaining.

    Sweeps discretization levels (5..6 equal-frequency bins) and the NCC
    s-parameter grid [min_ncc_s_param, max_ncc_s_param) with the given
    step, optionally injecting missing values and label noise into each
    split. Five per-fold metrics (`ich`, `cph`, `acc`, `acc_trans`,
    `avg_sols` — presumably incorrectness/completeness/accuracy counts;
    TODO confirm against computing_training_testing_step) are summed over
    folds and their per-fold averages appended as CSV rows to out_path.

    :param in_path: path to an existing ARFF training dataset.
    :param out_path: path to an existing CSV file; results are appended.
    :param seed: optional list of per-repetition seeds (one per repetition);
        generated randomly when None. NOTE(review): unlike the companion
        function above, a scalar here would break seed[time] indexing.
    :param nb_kFold: number of folds AND number of repetitions.
    :param nb_process: worker processes for the ManagerWorkers pool.
    :param min_ncc_s_param: inclusive lower bound of the s grid.
    :param max_ncc_s_param: exclusive upper bound of the s grid.
    :param step_ncc_s_param: step of the s grid.
    :param missing_pct: percentage of values made missing per split.
    :param noise_label_pct: percentage of labels perturbed per split.
    :param noise_label_type: label-noise variant selector (model-defined).
    :param noise_label_prob: probability parameter of the label noise.
    :param remove_features: optional list of column names to drop
        (best-effort; removal errors are printed, not raised).
    :param scaling: normalize features before discretization when True.
    :param strategy_chaining: chaining strategy passed to the workers.
    :param safety_chaining: safety flag passed to the workers.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset (%s, %s)', in_path, out_path)
    logger.info(
        "(min_ncc_s_param, max_ncc_s_param, step_ncc_s_param) (%s, %s, %s)",
        min_ncc_s_param, max_ncc_s_param, step_ncc_s_param)
    logger.info("(scaling, remove_features, process) (%s, %s, %s)", scaling,
                remove_features, nb_process)
    logger.info(
        "(missing_pct, noise_label_pct, noise_label_type, noise_label_prob) (%s, %s, %s, %s)",
        missing_pct, noise_label_pct, noise_label_type, noise_label_prob)
    logger.info("(strategy_chaining, safety_chaining) (%s, %s)",
                strategy_chaining, safety_chaining)

    # Seeding a random value for k-fold top learning-testing data
    # (one seed per repetition; an explicit list is used as-is).
    if seed is None:
        seed = [random.randrange(sys.maxsize) for _ in range(nb_kFold)]
    logger.debug("[FIRST-STEP-SEED] SEED: %s", seed)

    # Create a CSV file for saving results (append mode: earlier runs kept)
    file_csv = open(out_path, 'a')
    writer = csv.writer(file_csv)
    # Worker pool evaluating the chained NCC model asynchronously.
    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(
        class_model="classifip.models.mlc.chainncc.MLChaining")

    ich, cph, acc, acc_trans, avg_sols = dict(), dict(), dict(), dict(), dict()
    min_discretize, max_discretize = 5, 7
    for nb_disc in range(min_discretize, max_discretize):
        # Reload the raw data for every discretization level so each level
        # discretizes from the original (non-discretized) values.
        data_learning = arff.ArffFile()
        data_learning.load(in_path)
        if remove_features is not None:
            for r_feature in remove_features:
                try:
                    data_learning.remove_col(r_feature)
                except Exception as err:
                    print("Remove feature error: {0}".format(err))
        nb_labels = get_nb_labels_class(data_learning)
        if scaling:
            normalize(data_learning, n_labels=nb_labels)
        data_learning.discretize(discmet="eqfreq", numint=nb_disc)

        for time in range(nb_kFold):  # 10-10 times cross-validation
            logger.info(
                "Number interval for discreteness and labels (%1d, %1d)." %
                (nb_disc, nb_labels))
            cv_kfold = k_fold_cross_validation(data_learning,
                                               nb_kFold,
                                               randomise=True,
                                               random_seed=seed[time])

            # Clone each split and shuffle its labels once, so every s value
            # is evaluated on exactly the same perturbed train/test data.
            splits_s = list([])
            for training, testing in cv_kfold:
                train_clone_data = training.make_clone()
                test_clone_data = testing.make_clone()
                MLCNCC.shuffle_labels_train_testing(train_clone_data,
                                                    test_clone_data,
                                                    nb_labels=nb_labels)
                logger.info("Splits %s train %s", len(training.data),
                            training.data[0])
                logger.info("Splits %s test %s", len(testing.data),
                            testing.data[0])
                splits_s.append((train_clone_data, test_clone_data))

            # Keyed accumulators per (discretization, repetition) pair.
            disc = str(nb_disc) + "-" + str(time)
            ich[disc], cph[disc] = dict(), dict()
            acc_trans[disc], acc[disc] = dict(), dict()
            avg_sols[disc] = dict()
            for s_ncc in np.arange(min_ncc_s_param, max_ncc_s_param,
                                   step_ncc_s_param):
                ks_ncc = str(s_ncc)
                ich[disc][ks_ncc], cph[disc][ks_ncc] = 0, 0
                acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = 0, 0
                avg_sols[disc][ks_ncc] = 0
                for idx_fold, (training, testing) in enumerate(splits_s):
                    # Each step receives the running sums and returns the
                    # updated ones; division by nb_kFold below averages them.
                    res = computing_training_testing_step(
                        training, testing, nb_labels, s_ncc, manager,
                        strategy_chaining, safety_chaining, missing_pct,
                        noise_label_pct, noise_label_type, noise_label_prob,
                        ich[disc][ks_ncc], cph[disc][ks_ncc],
                        acc[disc][ks_ncc], acc_trans[disc][ks_ncc],
                        avg_sols[disc][ks_ncc])
                    ich[disc][ks_ncc], cph[disc][ks_ncc] = res[0], res[1]
                    acc[disc][ks_ncc], acc_trans[disc][ks_ncc] = res[2], res[3]
                    avg_sols[disc][ks_ncc] = res[4]
                    logger.debug(
                        "Partial-step-cumulative (acc, ich, acc_trans, avg_sols) (%s, %s, %s, %s)",
                        acc[disc][ks_ncc], ich[disc][ks_ncc],
                        acc_trans[disc][ks_ncc], avg_sols[disc][ks_ncc])
                writer.writerow([
                    str(nb_disc), s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                    cph[disc][ks_ncc] / nb_kFold, acc[disc][ks_ncc] / nb_kFold,
                    acc_trans[disc][ks_ncc] / nb_kFold,
                    avg_sols[disc][ks_ncc] / nb_kFold
                ])
                # Flush so partial results survive a crash of a long run.
                file_csv.flush()
                logger.debug("Partial-s-k_step (%s, %s, %s, %s, %s, %s)", disc,
                             s_ncc, time, ich[disc][ks_ncc] / nb_kFold,
                             cph[disc][ks_ncc] / nb_kFold,
                             acc_trans[disc][ks_ncc] / nb_kFold)
    # Shut down the worker pool before closing the results file.
    manager.poisonPillTraining()
    file_csv.close()
    logger.debug("Results Final: %s, %s", ich, cph)