def threshold_determination(self, context, classifier_name, patterns_outputs):
    """
    Using the outputs discretized at each ROC threshold value, determine the best threshold values.
    """
    statistics_class = Statistics()
    # Aux structures
    threshold_list = AutoVivification()
    minimum_error = AutoVivification()

    for class_text in context["classifiers"][classifier_name]["classes_names"]:
        # Initialize the aux structures
        threshold_list[class_text] = []
        minimum_error[class_text] = float('inf')
        self.info[classifier_name][class_text]["threshold"]["medium"] = float('inf')
        self.info[classifier_name][class_text]["threshold"]["minimum"] = float('inf')
        self.info[classifier_name][class_text]["threshold"]["maximum"] = float('-inf')

    # For each value of threshold generated
    for threshold in self.info[classifier_name]["roc_outputs"]:
        # Calculate the goodness of the classifier
        statistics_class.goodness(
            context,
            classifier_name,
            self.info[classifier_name]["roc_outputs"][threshold],
            patterns_outputs)
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            error = 0.0
            for function in context["classifiers"][classifier_name]["thresholds"]["metric"]:
                getattr(statistics_class, function)(classifier_name, context, self, "validation")
                error += statistics_class.measures[classifier_name][class_text][function]
            if error < minimum_error[class_text]:
                # New global minimum: reset the list and save the threshold again
                minimum_error[class_text] = error
                threshold_list[class_text] = [threshold]
            elif error == minimum_error[class_text]:
                # Tie in terms of goodness: save the whole range of values with the minimum error
                threshold_list[class_text].append(threshold)

    # Determine the different kinds of thresholds
    for class_text in context["classifiers"][classifier_name]["classes_names"]:
        if len(threshold_list[class_text]) == 0:
            raise ValueError("There is no threshold selected")

    return threshold_list

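# Illustrative sketch (hypothetical helper, not part of MULLPY): the tie-aware minimum search
# used above, isolated from the context/AutoVivification machinery. Names are assumptions.
def minimum_error_thresholds(errors_by_threshold):
    """Return every threshold whose error equals the global minimum."""
    minimum_error = float('inf')
    best_thresholds = []
    for threshold, error in errors_by_threshold.items():
        if error < minimum_error:
            # New global minimum: reset the candidate list
            minimum_error = error
            best_thresholds = [threshold]
        elif error == minimum_error:
            # Tie with the current minimum: keep every candidate with that error
            best_thresholds.append(threshold)
    return best_thresholds

# Example: thresholds 0.4 and 0.5 tie at the minimum error 0.10
#   minimum_error_thresholds({0.3: 0.25, 0.4: 0.10, 0.5: 0.10, 0.6: 0.30}) -> [0.4, 0.5]
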
def classes_counter_indexes(context, data_set):
    classes_counter = AutoVivification()
    classes_indexes = AutoVivification()
    classes_texts = context["classifiers"][context["classifier_list"][0]]["classes_names"]
    # The class membership columns are appended after the input columns of each row
    len_inputs = len(data_set[0]) - len(classes_texts)
    for class_text in classes_texts:
        column = [data_set[i][len_inputs + classes_texts.index(class_text)]
                  for i in range(len(data_set))]
        classes_counter[class_text] = np.sum(column)
        classes_indexes[class_text] = column
    return classes_counter, classes_indexes

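# Illustrative usage (hypothetical values, assuming the context layout used above): each row
# ends with one-hot class columns, so summing a class column gives that class's instance count.
#
#   context = {"classifier_list": ["c1"],
#              "classifiers": {"c1": {"classes_names": ["A", "B"]}}}
#   data_set = [[0.1, 0.2, 1, 0],   # class "A"
#               [0.3, 0.4, 0, 1],   # class "B"
#               [0.5, 0.6, 1, 0]]   # class "A"
#   counters, indexes = classes_counter_indexes(context, data_set)
#   # counters -> {"A": 2, "B": 1}; indexes -> {"A": [1, 0, 1], "B": [0, 1, 0]}
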
def structure_combined_features():
    from mullpy.auxiliar import AutoVivification
    structure = AutoVivification()
    i = 0
    # Build every combination of 2 to 5 features from the candidate variables
    for amount in range(2, 5 + 1):
        temporal = list(itertools.combinations(
            ["AGE", "EDUC", "LIMMTOTAL", "FAQ", "MMSE", "GDS", "LDELTOTAL"],
            amount))
        for t in temporal:
            structure[i] = list(t)
            i += 1
    return structure

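# Illustrative note: with 7 candidate features and subset sizes 2..5, the structure above
# enumerates C(7,2) + C(7,3) + C(7,4) + C(7,5) = 21 + 35 + 35 + 21 = 112 feature subsets,
# indexed 0..111, e.g. structure[0] == ["AGE", "EDUC"].
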
def select_best_configuration_each_combination(in_file, out_file):
    # Only write out the best configuration for each combination
    f = open(in_file)
    f2 = open(out_file, "w")
    lines = f.readlines()
    resultados = AutoVivification()
    for line in lines:
        resultados[line[:line.find(":")]] = line[line.find("\t") + 1:]

    temp = []
    for classifier_name in reversed(
            [x for x in sorted(resultados.keys(), key=lambda y: resultados[y])]):
        res = re.search(r'[0-9]+', classifier_name[:classifier_name.find("_")])
        nombre = classifier_name[res.start():res.end()]
        if nombre not in temp:
            f2.write(classifier_name + ":\t")
            f2.write("%.4f\n" % (float(resultados[classifier_name])))
            temp.append(nombre)
    f.close()
    f2.close()

def __init__(self, context, ensemble_name, information, pattern_kind_list):
    """
    Complete the Information class with the ensemble's decisions.
    self.info, an AutoVivification instance, may contain only ensemble-internal information.
    Build the real and discretized outputs of the Ensemble, depending on the Ensemble kind.
    """
    self.info = AutoVivification()
    self.weights = None
    self.determine_ensemble_threshold(context, ensemble_name)
    for pattern_kind in pattern_kind_list:
        self._init_decision_matrix(context, ensemble_name, pattern_kind)
        self._build_decision_matrix(context, ensemble_name, information, pattern_kind)
        if nested_dict_access(["classifiers", ensemble_name, "meta_learner"], context):
            self.meta_learner(context, ensemble_name, information)
        else:
            self._schedule_decisions(context, ensemble_name, information, pattern_kind)

def random_distribution(self, context):
    """
    Bagging methods come in many flavours, but mostly differ from each other in the way they
    draw random subsets of the training set:
        - When random subsets of the dataset are drawn as random subsets of the samples,
          the algorithm is known as Pasting Rvotes.
        - When samples are drawn with replacement, the method is known as Bagging.
        - When random subsets of the dataset are drawn as random subsets of the features,
          the method is known as Random Subspaces.
        - When base estimators are built on subsets of both samples and features, the method
          is known as Random Patches.

    The group_successive variable groups the instances in blocks of X; the instances of a
    block have to stay together in the sampling process.
    """
    total_length = 0
    lengths = AutoVivification()
    for pattern_kind in context["patterns"].patterns[context["classifier_list"][0]]:
        lengths[pattern_kind] = len(
            context["patterns"].patterns[context["classifier_list"][0]][pattern_kind])
        total_length += lengths[pattern_kind]

    # Check that the patterns of every classifier have the same size
    for classifier_name in context["classifier_list"]:
        for pattern_kind in context["patterns"].patterns[classifier_name]:
            if len(context["patterns"].patterns[classifier_name][pattern_kind]) != lengths[pattern_kind]:
                raise ValueError(
                    'The length of the %s pattern of classifier %s has different size from others'
                    % (pattern_kind, classifier_name))

    if context["preprocess"]["random_distribution"]["group_successive"]:
        total_length = int(
            total_length / context["preprocess"]["random_distribution"]["group_successive"])
        for pattern_kind in lengths:
            lengths[pattern_kind] = int(
                lengths[pattern_kind] / context["preprocess"]["random_distribution"]["group_successive"])

    dir_name = context["general_path"] + "patterns/" + \
        context["classifiers"][context["classifier_list"][0]]["set"]
    filters = AutoVivification()

    # Specific kind of sampling
    # Bagging
    if "bagging" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["bagging"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.bagging(context, filters, lengths, total_length)
        dir_name += "_bagging/"
    # Pasting Rvotes
    elif "pasting_Rvotes" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.pasting_rvotes(context, filters, lengths, total_length)
        dir_name += "_pasting_Rvotes/"
    # Random Subspaces
    elif "random_subspaces" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["random_subspaces"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.random_subspaces(context, filters, features_amount)
        dir_name += "_random_subspaces/"
    # All features combinations
    elif "all_features_combination" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["all_features_combination"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.all_features_combination(context, filters, features_amount)
        dir_name += "_features_combination/"
        context["preprocess"]["random_distribution"]["number_base_classifiers"] = \
            len(filters["learning"])
    # Random Patches
    elif "random_patches" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["random_patches"]["activate"]:
        dir_name += "_random_patches/"
    # K-fold
    elif "k_fold" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["k_fold"]["activate"]:
        for pattern_kind in context["preprocess"]["random_distribution"]["k_fold"]["percents"]:
            filters[pattern_kind] = []
        self.k_fold(context, filters)
        dir_name += "_k_fold/"
    # Forecasting distribution (walking forward)
    elif "forecasting_distribution" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["forecasting_distribution"]["activate"]:
        self.forecasting_distribution(context, filters)
        dir_name += "_walking_forward/"

    # Common functions for the sample-index based methods (bagging and pasting)
    if ("bagging" in context["preprocess"]["random_distribution"] and
            context["preprocess"]["random_distribution"]["bagging"]["activate"]) or \
            ("pasting_Rvotes" in context["preprocess"]["random_distribution"] and
             context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]):
        if context["preprocess"]["random_distribution"]["group_successive"]:
            # Expand each sampled index into its whole group of successive instances
            for kind_of in filters:
                for filter_ in filters[kind_of]:
                    for i in range(len(filter_)):
                        filter_[i] = filter_[i] * \
                            context["preprocess"]["random_distribution"]["group_successive"]
                        for j in range(1, context["preprocess"]["random_distribution"]["group_successive"]):
                            filter_.append(filter_[i] + j)

    path_exists(dir_name)
    self._generate_new_patterns_random_distribution(context, filters, dir_name)

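# Illustrative sketch (hypothetical helper, not MULLPY's self.bagging / self.pasting_rvotes):
# the core difference between the two sampling schemes described in the docstring above is
# whether the instance indexes are drawn with replacement.
import numpy as np  # numpy is assumed to be available as np, as elsewhere in this module

def draw_sample_indexes(total_length, sample_size, with_replacement):
    """Bagging draws indexes with replacement; Pasting Rvotes draws them without."""
    if with_replacement:
        # Bagging: any index can be drawn several times
        return list(np.random.randint(0, total_length, size=sample_size))
    # Pasting Rvotes: a random subset without repetition
    indexes = list(range(total_length))
    np.random.shuffle(indexes)
    return indexes[:sample_size]

# Example:
#   draw_sample_indexes(100, 30, with_replacement=True)   # bagging-style sample
#   draw_sample_indexes(100, 30, with_replacement=False)  # pasting-style sample
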
def k_fold(self, context, filters):
    classes_texts = context["classifiers"][context["classifier_list"][0]]["classes_names"]
    num_instances = sum([
        len(context["patterns"].patterns[context["classifier_list"][0]][x])
        for x in context["patterns"].patterns[context["classifier_list"][0]]])

    # Concatenate every pattern set of the reference classifier into a single data set
    data_set = None
    for i, filter_name in enumerate(context["patterns"].patterns[context["classifier_list"][0]].keys()):
        if i == 0:
            data_set = context["patterns"].patterns[context["classifier_list"][0]][filter_name]
        else:
            data_set = np.concatenate(
                (data_set, context["patterns"].patterns[context["classifier_list"][0]][filter_name]))

    total_classes_counter, classes_indexes = self.classes_counter_indexes(context, data_set)
    classes_counter = AutoVivification()
    min_limit_classes = np.min([
        total_classes_counter[class_counter] for class_counter in total_classes_counter])

    for i in range(context["preprocess"]["random_distribution"]["number_base_classifiers"]):
        total_indexes = []
        for j, filter_name in enumerate(["learning", "validation"]):
            aux_list = []
            aux_percent = context["preprocess"]["random_distribution"]["k_fold"]["percents"][filter_name]

            # The last subset takes every instance that has not been assigned yet
            if j == len(context["preprocess"]["random_distribution"]["k_fold"]["percents"]) - 1:
                filters[filter_name].append(
                    [x for x in range(len(data_set)) if x not in total_indexes])
                break
            else:
                if context["preprocess"]["random_distribution"]["k_fold"]["balanced"]:
                    # Balanced folds: give every class the same quota, limited by the rarest class
                    total_instances = 0
                    for class_text in context["classifiers"][context["classifier_list"][0]]["classes_names"]:
                        classes_counter[filter_name][class_text] = np.ceil(aux_percent * min_limit_classes)
                        total_instances += classes_counter[filter_name][class_text]
                else:
                    total_instances = np.ceil(aux_percent * num_instances)

                len_inputs = len(data_set[0]) - len(classes_texts)
                # Sample random instances until the subset reaches its target size
                while len(aux_list) != total_instances:
                    value = np.random.randint(0, len(data_set))
                    if value not in total_indexes:
                        if context["preprocess"]["random_distribution"]["k_fold"]["balanced"]:
                            class_text = classes_texts[list(data_set[value][len_inputs:]).index(1)]
                            if classes_counter[filter_name][class_text] > 0:
                                total_indexes.append(value)
                                aux_list.append(value)
                                classes_counter[filter_name][class_text] -= 1
                        else:
                            total_indexes.append(value)
                            aux_list.append(value)
                filters[filter_name].append(aux_list)

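# Illustrative example of the balanced quota above (hypothetical numbers): if the rarest class
# has min_limit_classes == 40 instances and the "learning" percent is 0.7, every class gets
# ceil(0.7 * 40) == 28 slots, so the learning fold is class-balanced regardless of the original
# class frequencies; the remaining indexes go to the last subset ("validation").
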
def classes_error(self, context, classifier_name):
    self.info[classifier_name]["selection_errors"] = []
    statistics_class = Statistics()
    values = AutoVivification()
    pattern_kind = context["pattern_kind"]
    outputs_kind = context["outputs_kind"]

    if classifier_name in context["classifier_list"]:
        temporal_patterns = copy.deepcopy(
            context["patterns"].patterns[classifier_name][pattern_kind])
    else:
        original = self.info[classifier_name][outputs_kind][pattern_kind]
        original_pattern_ref = context["patterns"].patterns[classifier_name][pattern_kind]

    for i in range(1, len(context["classifiers"][classifier_name]["classes_names"])):
        # Build every distinct layout with i positive classes (+1) against the rest (-1)
        temp = [1] * i
        temp.extend([-1] * (len(context["classifiers"][classifier_name]["classes_names"]) - i))
        values[i] = [temp]
        for new in permutations(values[i][0]):
            if new not in values[i]:
                values[i].append(new)

        if classifier_name in context["classifier_list"]:
            context["patterns"].modify_patterns_temporally(
                classifier_name,
                pattern_kind,
                context["patterns"].filter_classes(classifier_name, pattern_kind, values[i]))
            self.build_real_outputs(context, classifier_name, pattern_kind)
            self.discretize_outputs(context, classifier_name, pattern_kind)
            ref_patterns = context["patterns"].patterns[classifier_name][pattern_kind]
        else:
            positions = [position for position, instance in enumerate(original_pattern_ref)
                         if instance[1] in values[i]]
            self.info[classifier_name][outputs_kind][pattern_kind] = \
                [original[i] for i in range(len(original)) if i in positions]
            ref_patterns = [original_pattern_ref[i] for i in range(len(original_pattern_ref))
                            if i in positions]

        statistics_class.goodness(
            context,
            classifier_name,
            self.info[classifier_name][outputs_kind][pattern_kind],
            ref_patterns)
        self.info[classifier_name]["selection_errors"].append(
            statistics_class.measures[classifier_name]['E'])

    if classifier_name in context["classifier_list"]:
        # Recover the original patterns
        context["patterns"].modify_patterns_temporally(classifier_name, pattern_kind, temporal_patterns)
        self.build_real_outputs(context, classifier_name, pattern_kind)
        self.discretize_outputs(context, classifier_name, pattern_kind)
    else:
        self.info[classifier_name][outputs_kind][pattern_kind] = original
        from mullpy.ensembles import Ensemble
        Ensemble(context, classifier_name, self, [pattern_kind])

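# Illustrative sketch of the label layouts built above (assuming 3 classes): for i == 1, the
# distinct permutations of [1, -1, -1] are the three one-vs-rest encodings.
#
#   from itertools import permutations
#   base = [1, -1, -1]
#   layouts = []
#   for new in permutations(base):
#       if new not in layouts:
#           layouts.append(new)
#   # layouts -> [(1, -1, -1), (-1, 1, -1), (-1, -1, 1)]
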
def __init__(self):
    """
    Internal structure as AutoVivification class.
    """
    self.info = AutoVivification()

def __init__(self, context):
    self.patterns = AutoVivification()
    for classifier_name in context["classifier_list"]:
        for pattern_kind in context["patterns_texts"]:
            self.patterns[classifier_name][pattern_kind] = None

def __init__(self):
    """
    Initialize the internal structure as AutoVivification class.
    """
    self.measures = AutoVivification()