    def threshold_determination(self, context, classifier_name,
                                patterns_outputs):
        """
        Given the discretized outputs for the ROC values, determine the
        best threshold values.
        """
        statistics_class = Statistics()
        #Auxiliary structures
        threshold_list = AutoVivification()
        minimum_error = AutoVivification()

        #Initialize the auxiliary structures
        for class_text in context["classifiers"][classifier_name][
                "classes_names"]:
            threshold_list[class_text] = []
            minimum_error[class_text] = float('inf')
            self.info[classifier_name][class_text]["threshold"][
                "medium"] = float('inf')
            self.info[classifier_name][class_text]["threshold"][
                "minimum"] = float('inf')
            self.info[classifier_name][class_text]["threshold"][
                "maximum"] = float('-inf')

        #For each threshold value generated
        for threshold in self.info[classifier_name]["roc_outputs"]:
            #Calculate the goodness of the classifier at this threshold
            statistics_class.goodness(
                context, classifier_name,
                self.info[classifier_name]["roc_outputs"][threshold],
                patterns_outputs)
            for class_text in context["classifiers"][classifier_name][
                    "classes_names"]:
                error = 0.0
                for function in context["classifiers"][classifier_name][
                        "thresholds"]["metric"]:
                    getattr(statistics_class,
                            function)(classifier_name, context, self,
                                      "validation")
                    error += statistics_class.measures[classifier_name][
                        class_text][function]
                if error < minimum_error[class_text]:
                    #A new global minimum: reset the list and save the
                    #threshold again
                    minimum_error[class_text] = error
                    threshold_list[class_text] = [threshold]
                elif error == minimum_error[class_text]:
                    #A tie in goodness: keep the whole range of threshold
                    #values that share the minimum error
                    threshold_list[class_text].append(threshold)

        for class_text in threshold_list:
            if len(threshold_list[class_text]) == 0:
                raise ValueError("There is no threshold selected")
        return threshold_list
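#Every snippet below relies on mullpy's AutoVivification. As background, a
#minimal sketch of such a class, assuming the classic autovivifying-dict
#recipe; mullpy.auxiliar.AutoVivification may differ in details.
class AutoVivification(dict):
    """Nested dict that creates missing levels on first access."""
    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value

info = AutoVivification()
info["classifier"]["class_A"]["threshold"]["minimum"] = 0.5  #no KeyError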
Example #2
    def classes_counter_indexes(context, data_set):
        """
        Count the instances of each class and keep each class's one-hot
        target column. Class columns come after the input columns.
        Used as a static helper (k_fold below calls it through self).
        """
        classes_counter = AutoVivification()
        classes_indexes = AutoVivification()
        classes_texts = context["classifiers"][context["classifier_list"]
                                               [0]]["classes_names"]
        #Number of input columns: total width minus one column per class
        len_inputs = len(data_set[0]) - len(classes_texts)

        for position, class_text in enumerate(classes_texts):
            column = [row[len_inputs + position] for row in data_set]
            classes_counter[class_text] = np.sum(column)
            classes_indexes[class_text] = column

        return classes_counter, classes_indexes
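#A toy check of the one-hot layout classes_counter_indexes assumes: input
#columns first, then one one-hot target column per class. The context dict
#below is a hypothetical stand-in for a real mullpy context, and the helper
#is called as a plain function here purely for illustration.
import numpy as np

context = {"classifier_list": ["c0"],
           "classifiers": {"c0": {"classes_names": ["A", "B"]}}}
data_set = [[0.1, 0.2, 1, 0],
            [0.3, 0.4, 0, 1],
            [0.5, 0.6, 1, 0]]
counter, indexes = classes_counter_indexes(context, data_set)
#counter -> {"A": 2, "B": 1}; indexes["A"] -> [1, 0, 1]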
def structure_combined_features():
    import itertools
    from mullpy.auxiliar import AutoVivification

    #Enumerate every combination of 2 to 5 of the candidate features
    structure = AutoVivification()
    i = 0
    for amount in range(2, 5 + 1):
        temporal = list(
            itertools.combinations([
                "AGE", "EDUC", "LIMMTOTAL", "FAQ", "MMSE", "GDS", "LDELTOTAL"
            ], amount))
        for t in temporal:
            structure[i] = list(t)
            i += 1
    return structure
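#Quick sanity check: for 7 candidate features taken 2 through 5 at a time,
#the structure holds C(7,2)+C(7,3)+C(7,4)+C(7,5) = 21+35+35+21 = 112 entries
#(assumes mullpy.auxiliar is importable).
structure = structure_combined_features()
assert len(structure) == 112
print(structure[0])  #['AGE', 'EDUC'], the first 2-feature combination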
def select_best_configuration_each_combination(in_file, out_file):
    #Only write the best configuration of each combination
    import re
    from mullpy.auxiliar import AutoVivification

    resultados = AutoVivification()
    with open(in_file) as f:
        for line in f:
            #Line format: "<classifier_name>:\t<score>"
            resultados[line[:line.find(":")]] = line[line.find("\t") + 1:]

    temp = []
    with open(out_file, "w") as f2:
        #Visit configurations from best to worst score
        for classifier_name in sorted(resultados.keys(),
                                      key=lambda y: float(resultados[y]),
                                      reverse=True):
            res = re.search(r'[0-9]+',
                            classifier_name[:classifier_name.find("_")])
            nombre = classifier_name[res.start():res.end()]
            if nombre not in temp:
                f2.write(classifier_name + ":\t")
                f2.write("%.4f\n" % float(resultados[classifier_name]))
                temp.append(nombre)
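#The parser above assumes one result per line: the classifier name before
#":" and the score after the first tab. A toy round-trip of that format
#(the line content is hypothetical):
import re

line = "mlp17_config3:\t0.9321\n"
name = line[:line.find(":")]               #'mlp17_config3'
score = float(line[line.find("\t") + 1:])  #0.9321
#The combination id is the first digit run before the first "_":
combination = re.search(r'[0-9]+', name[:name.find("_")]).group()  #'17'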
    def __init__(self, context, ensemble_name, information, pattern_kind_list):
        """
        Complete the Information class with the ensembles decisions
        Self.info as a AutoVivification class might contain only ensemble internal information
        Build real and discretized outputs of the Ensemble, depending of the Ensemble kind.
        """
        self.info = AutoVivification()
        self.weights = None
        self.determine_ensemble_threshold(context, ensemble_name)

        for pattern_kind in pattern_kind_list:
            self._init_decision_matrix(context, ensemble_name, pattern_kind)
            self._build_decision_matrix(context, ensemble_name, information,
                                        pattern_kind)
            if nested_dict_access(
                ["classifiers", ensemble_name, "meta_learner"], context):
                self.meta_learner(context, ensemble_name, information)
            else:
                self._schedule_decisions(context, ensemble_name, information,
                                         pattern_kind)
Example #6
    def random_distribution(self, context):
        """
        Bagging methods come in many flavours but mostly differ from each other by the way they draw random subsets
         of the training set:

        -When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known
        as Pasting Rvotes.
        -When samples are drawn with replacement, then the method is known as Bagging.
        -When random subsets of the dataset are drawn as random subsets of the features, then the method is known as
        Random Subspaces.
        -When base estimators are built on subsets of both samples and features, then the method is known as Random
        Patches.

        group_successive variable groups each X instances. Each of these successive instances has to be together in
        the sampling process
        """
        total_length = 0
        lengths = AutoVivification()
        for pattern_kind in context["patterns"].patterns[
                context["classifier_list"][0]]:
            lengths[pattern_kind] = len(context["patterns"].patterns[
                context["classifier_list"][0]][pattern_kind])
            total_length += lengths[pattern_kind]

        #Check that every pattern kind has the same length across classifiers
        for classifier_name in context["classifier_list"]:
            for pattern_kind in context["patterns"].patterns[classifier_name]:
                if len(context["patterns"].patterns[classifier_name]
                       [pattern_kind]) != lengths[pattern_kind]:
                    raise ValueError(
                        'The length of the %s pattern of classifier %s differs from the others'
                        % (pattern_kind, classifier_name))

        if context["preprocess"]["random_distribution"]["group_successive"]:
            total_length = int(total_length / context["preprocess"]
                               ["random_distribution"]["group_successive"])
            for pattern_kind in lengths:
                lengths[pattern_kind] = int(
                    lengths[pattern_kind] / context["preprocess"]
                    ["random_distribution"]["group_successive"])

        dir_name = context["general_path"] + "patterns/" + context[
            "classifiers"][context["classifier_list"][0]]["set"]
        filters = AutoVivification()
        ###Specific kind of sampling###
        #############
        ######BAGGING
        #############
        if "bagging" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["bagging"]["activate"]:
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.bagging(context, filters, lengths, total_length)
            dir_name += "_bagging/"
        #############
        ######PASTING
        #############
        elif "pasting_Rvotes" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]:
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.pasting_rvotes(context, filters, lengths, total_length)
            dir_name += "_pasting_Rvotes/"
        #################
        #RANDOM SUBSPACES
        #################
        elif "random_subspaces" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["random_subspaces"]["activate"]:
            features_amount = self.check_features_amount(context)
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.random_subspaces(context, filters, features_amount)
            dir_name += "_random_subspaces/"
        #############
        #COMBINATIONS
        #############
        elif "all_features_combination" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["all_features_combination"]["activate"]:
            features_amount = self.check_features_amount(context)
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.all_features_combination(context, filters, features_amount)
            dir_name += "_features_combination/"
            context["preprocess"]["random_distribution"][
                "number_base_classifiers"] = len(filters["learning"])
        ###############
        #RANDOM PATCHES
        ###############
        elif "random_patches" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["random_patches"]["activate"]:
            dir_name += "_random_patches/"
        ###############
        #K-FOLD
        ###############
        elif "k_fold" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["k_fold"]["activate"]:
            for pattern_kind in context["preprocess"]["random_distribution"][
                    "k_fold"]["percents"]:
                filters[pattern_kind] = []
            self.k_fold(context, filters)
            dir_name += "_k_fold/"
        ###############
        #Forecasting distribution
        ###############
        elif "forecasting_distribution" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["forecasting_distribution"]["activate"]:
            self.forecasting_distribution(context, filters)
            dir_name += "_walking_forward/"

        ###Common functions###
        #The group_successive expansion applies to both bagging and pasting,
        #so it must run after the specific sampling rather than as another
        #branch of the elif chain above.
        if (("bagging" in context["preprocess"]["random_distribution"] and
             context["preprocess"]["random_distribution"]["bagging"]["activate"])
                or ("pasting_Rvotes" in context["preprocess"]["random_distribution"] and
                    context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"])):
            if context["preprocess"]["random_distribution"][
                    "group_successive"]:
                for kind_of in filters:
                    for filter_ in filters[kind_of]:
                        for i in range(len(filter_)):
                            #Scale each sampled index to the start of its
                            #block and append the rest of the block
                            filter_[i] = (
                                filter_[i] * context["preprocess"]
                                ["random_distribution"]["group_successive"])
                            for j in range(
                                    1, context["preprocess"]
                                    ["random_distribution"]["group_successive"]):
                                filter_.append(filter_[i] + j)

        path_exists(dir_name)

        self._generate_new_patterns_random_distribution(
            context, filters, dir_name)
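#The bagging branch above delegates the actual draw to self.bagging. As a
#point of reference, a minimal sketch of that draw (indexes sampled with
#replacement); the function name and signature are hypothetical, not
#mullpy's API.
import numpy as np

def bagging_indexes(n_instances, n_classifiers, seed=None):
    """For each base classifier, draw n_instances indexes with replacement."""
    rng = np.random.default_rng(seed)
    return [rng.integers(0, n_instances, size=n_instances).tolist()
            for _ in range(n_classifiers)]

filters_sketch = bagging_indexes(100, 5)  #5 resamples of 100 indexes each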
Example #7
    def k_fold(self, context, filters):
        """
        Split the joined data set into the partitions defined in
        context["preprocess"]["random_distribution"]["k_fold"]["percents"],
        optionally keeping every partition balanced across classes. A
        condensed sketch of the balanced quota rule follows the method.
        """
        classes_texts = context["classifiers"][context["classifier_list"]
                                               [0]]["classes_names"]
        num_instances = sum([
            len(context["patterns"].patterns[context["classifier_list"][0]][x])
            for x in context["patterns"].patterns[context["classifier_list"]
                                                  [0]]
        ])

        data_set = None
        for i, filter_name in enumerate(context["patterns"].patterns[
                context["classifier_list"][0]].keys()):
            if i == 0:
                data_set = context["patterns"].patterns[
                    context["classifier_list"][0]][filter_name]
            else:
                #np.concatenate expects a sequence of arrays
                data_set = np.concatenate(
                    (data_set, context["patterns"].patterns[
                        context["classifier_list"][0]][filter_name]))

        total_classes_counter, classes_indexes = self.classes_counter_indexes(
            context, data_set)
        classes_counter = AutoVivification()
        min_limit_classes = np.min([
            total_classes_counter[class_counter]
            for class_counter in total_classes_counter
        ])

        for i in range(context["preprocess"]["random_distribution"]
                       ["number_base_classifiers"]):
            total_indexes = []
            for j, filter_name in enumerate(["learning", "validation"]):
                aux_list = []
                aux_percent = context["preprocess"]["random_distribution"][
                    "k_fold"]["percents"][filter_name]
                if j == len(context["preprocess"]["random_distribution"]
                            ["k_fold"]["percents"]) - 1:
                    filters[filter_name].append([
                        x for x in range(len(data_set))
                        if x not in total_indexes
                    ])
                    break
                else:
                    if context["preprocess"]["random_distribution"]["k_fold"][
                            "balanced"]:
                        total_instances = 0
                        for class_text in context["classifiers"][context[
                                "classifier_list"][0]]["classes_names"]:
                            classes_counter[filter_name][class_text] = np.ceil(
                                aux_percent * min_limit_classes)
                            total_instances += classes_counter[filter_name][
                                class_text]
                    else:
                        total_instances = np.ceil(aux_percent * num_instances)

                len_inputs = len(data_set[0]) - len(classes_texts)
                #Draw random instances until the partition quota is filled
                while len(aux_list) != total_instances:
                    value = np.random.randint(0, len(data_set))
                    if value not in total_indexes:
                        if context["preprocess"]["random_distribution"][
                                "k_fold"]["balanced"]:
                            if classes_counter[filter_name][classes_texts[list(
                                    data_set[value][len_inputs:]).index(
                                        1)]] > 0:
                                total_indexes.append(value)
                                aux_list.append(value)
                                classes_counter[filter_name][classes_texts[
                                    list(data_set[value][len_inputs:]).index(
                                        1)]] -= 1
                        else:
                            total_indexes.append(value)
                            aux_list.append(value)

                filters[filter_name].append(aux_list)
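#The balanced branch above caps each partition's per-class quota by the size
#of the smallest class, so no class dominates a fold. A condensed sketch of
#that quota rule (the helper name is hypothetical):
import numpy as np

def balanced_quota(percent, class_counts):
    """Instances allowed per class in one partition: a fixed share of the
    smallest class."""
    smallest = min(class_counts.values())
    return {c: int(np.ceil(percent * smallest)) for c in class_counts}

balanced_quota(0.25, {"A": 120, "B": 40})  #-> {'A': 10, 'B': 10}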
    def classes_error(self, context, classifier_name):
        """
        Compute the selection error 'E' for every grouping of the classes
        into positive/negative sign vectors, temporarily filtering the
        patterns (or the ensemble outputs) and restoring them afterwards.
        A direct construction of these sign vectors follows the method.
        """
        import copy
        from itertools import permutations

        self.info[classifier_name]["selection_errors"] = []

        statistics_class = Statistics()
        values = AutoVivification()
        pattern_kind = context["pattern_kind"]
        outputs_kind = context["outputs_kind"]

        if classifier_name in context["classifier_list"]:
            temporal_patterns = copy.deepcopy(
                context["patterns"].patterns[classifier_name][pattern_kind])
        else:
            original = self.info[classifier_name][outputs_kind][pattern_kind]
            original_pattern_ref = context["patterns"].patterns[
                classifier_name][pattern_kind]

        for i in range(
                1,
                len(context["classifiers"][classifier_name]["classes_names"])):
            #Sign vector with i positive classes and the rest negative
            temp = [1] * i
            temp.extend([-1] * (len(
                context["classifiers"][classifier_name]["classes_names"]) - i))
            values[i] = [temp]
            for new in permutations(values[i][0]):
                if new not in values[i]:
                    values[i].append(new)

            if classifier_name in context["classifier_list"]:
                context["patterns"].modify_patterns_temporally(
                    classifier_name, pattern_kind,
                    context["patterns"].filter_classes(classifier_name,
                                                       pattern_kind,
                                                       values[i]))
                self.build_real_outputs(context, classifier_name, pattern_kind)
                self.discretize_outputs(context, classifier_name, pattern_kind)
                ref_patterns = context["patterns"].patterns[classifier_name][
                    pattern_kind]
            else:
                positions = [
                    position
                    for position, instance in enumerate(original_pattern_ref)
                    if instance[1] in values[i]
                ]
                self.info[classifier_name][outputs_kind][pattern_kind] = \
                    [original[i] for i in range(len(original)) if i in positions]
                ref_patterns = [
                    original_pattern_ref[i]
                    for i in range(len(original_pattern_ref)) if i in positions
                ]

            statistics_class.goodness(
                context, classifier_name,
                self.info[classifier_name][outputs_kind][pattern_kind],
                ref_patterns)
            self.info[classifier_name]["selection_errors"].append(
                statistics_class.measures[classifier_name]['E'])

            if classifier_name in context["classifier_list"]:
                #Recover the original patterns
                context["patterns"].modify_patterns_temporally(
                    classifier_name, pattern_kind, temporal_patterns)
                self.build_real_outputs(context, classifier_name, pattern_kind)
                self.discretize_outputs(context, classifier_name, pattern_kind)
            else:
                self.info[classifier_name][outputs_kind][
                    pattern_kind] = original
                from mullpy.ensembles import Ensemble

                Ensemble(context, classifier_name, self, [pattern_kind])
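#The sign vectors built inside classes_error are all distinct arrangements
#of i ones and (n_classes - i) minus-ones. An equivalent, more direct
#construction using a set, shown for clarity (the method above deduplicates
#manually):
from itertools import permutations

def sign_vectors(n_classes, i):
    """All distinct orderings of i ones and (n_classes - i) minus-ones."""
    base = [1] * i + [-1] * (n_classes - i)
    return [list(p) for p in set(permutations(base))]

sign_vectors(3, 1)  #[[1, -1, -1], [-1, 1, -1], [-1, -1, 1]] in some order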
    def __init__(self):
        """
        Initialize the internal structure as an AutoVivification instance.
        """
        self.info = AutoVivification()
Example #10
    def __init__(self, context):
        #One patterns slot per (classifier, pattern kind) pair
        self.patterns = AutoVivification()
        for classifier_name in context["classifier_list"]:
            for pattern_kind in context["patterns_texts"]:
                self.patterns[classifier_name][pattern_kind] = None
    def __init__(self):
        """
        Initialize the internal structure as an AutoVivification instance.
        """
        self.measures = AutoVivification()