def threshold_determination(self, context, classifier_name, patterns_outputs):
    """
    With the discretized outputs for roc values, determine the best values
    for the threshold.

    For every candidate threshold in
    ``self.info[classifier_name]["roc_outputs"]`` the configured metrics are
    accumulated per class; the threshold(s) reaching the minimum accumulated
    error are kept (all of them on a tie).

    :param context: global configuration dictionary.
    :param classifier_name: name of the classifier being analysed.
    :param patterns_outputs: desired outputs the candidates are scored against.
    :return: AutoVivification mapping class name -> list of best thresholds.
    :raises ValueError: if no threshold could be selected for some class.
    """
    statistics_class = Statistics()
    # Aux structures
    threshold_list = AutoVivification()
    minimum_error = AutoVivification()
    for class_text in context["classifiers"][classifier_name]["classes_names"]:
        # Initialize the aux structures
        threshold_list[class_text] = []
        minimum_error[class_text] = float('inf')
        self.info[classifier_name][class_text]["threshold"]["medium"] = float('inf')
        self.info[classifier_name][class_text]["threshold"]["minimum"] = float('inf')
        self.info[classifier_name][class_text]["threshold"]["maximum"] = float('-inf')
    # For each value of threshold generated
    for threshold in self.info[classifier_name]["roc_outputs"]:
        # Calculate the goodness of the classifier at this threshold
        statistics_class.goodness(
            context,
            classifier_name,
            self.info[classifier_name]["roc_outputs"][threshold],
            patterns_outputs)
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            error = 0.0
            for function in context["classifiers"][classifier_name]["thresholds"]["metric"]:
                getattr(statistics_class, function)(classifier_name, context,
                                                   self, "validation")
                error += statistics_class.measures[classifier_name][class_text][function]
            if error < minimum_error[class_text]:
                # New global minimum: reset the list and save the threshold.
                minimum_error[class_text] = error
                threshold_list[class_text] = [threshold]
            elif error == minimum_error[class_text]:
                # Tie in terms of goodness: save the whole range of values
                # with the minimum error.
                # BUGFIX: this used to be a second independent `if`, so a
                # freshly found minimum was appended a second time right
                # after being stored, duplicating it in the list.
                threshold_list[class_text].append(threshold)
    # Every class must have ended up with at least one selected threshold.
    # BUGFIX: the check previously inspected only the last iterated class.
    for class_text in context["classifiers"][classifier_name]["classes_names"]:
        if len(threshold_list[class_text]) == 0:
            raise ValueError("There is no threshold selected")
    return threshold_list
def classes_counter_indexes(context, data_set):
    """
    Count the instances of every class and keep its indicator column.

    The class-indicator columns are the last ``len(classes_names)`` columns
    of each row in ``data_set``; the counter of a class is the sum of its
    column.

    :param context: global configuration dictionary.
    :param data_set: sequence of pattern rows (inputs followed by one
        indicator column per class).
    :return: (classes_counter, classes_indexes) AutoVivification pair.
    """
    counters = AutoVivification()
    indicator_columns = AutoVivification()
    class_names = context["classifiers"][context["classifier_list"][0]]["classes_names"]
    first_class_column = len(data_set[0]) - len(class_names)
    for offset, class_name in enumerate(class_names):
        column = [row[first_class_column + offset] for row in data_set]
        counters[class_name] = np.sum(column)
        indicator_columns[class_name] = column
    return counters, indicator_columns
def select_best_configuration_each_combination(in_file, out_file):
    """
    Write only the best configuration for each combination.

    Reads ``in_file`` lines of the form ``<name>:\\t<score>`` and writes to
    ``out_file`` one line per combination id (the digits before the first
    underscore of the name), keeping the highest-scoring configuration of
    each id. Scores are compared as strings, as stored in the input file.
    """
    # A plain dict is enough here: keys are only assigned and read back.
    results = {}
    # BUGFIX: use context managers so both files are closed even when an
    # exception is raised (the output handle previously leaked on error).
    with open(in_file) as source:
        for line in source:
            results[line[:line.find(":")]] = line[line.find("\t") + 1:]
    written_ids = []
    with open(out_file, "w") as target:
        # Iterate from best to worst score.
        for classifier_name in reversed(sorted(results, key=lambda y: results[y])):
            res = re.search(r'[0-9]+', classifier_name[:classifier_name.find("_")])
            combination_id = classifier_name[res.start():res.end()]
            if combination_id not in written_ids:
                target.write(classifier_name + ":\t")
                target.write("%.4f\n" % (float(results[classifier_name])))
                written_ids.append(combination_id)
def select_best_configuration_each_combination(in_file, out_file):
    """
    Write only the best configuration for each combination.

    Reads ``in_file`` lines of the form ``<name>:\\t<score>`` and writes to
    ``out_file`` one line per combination id (the digits before the first
    underscore of the name), keeping the highest-scoring configuration of
    each id. Scores are compared as strings, as stored in the input file.
    """
    # NOTE(review): this function is defined twice in this file; the later
    # definition shadows the earlier one — consider removing one copy.
    results = {}
    # BUGFIX: use context managers so both files are closed even when an
    # exception is raised (the output handle previously leaked on error).
    with open(in_file) as source:
        for line in source:
            results[line[:line.find(":")]] = line[line.find("\t") + 1:]
    written_ids = []
    with open(out_file, "w") as target:
        # Iterate from best to worst score.
        for classifier_name in reversed(sorted(results, key=lambda y: results[y])):
            res = re.search(r'[0-9]+', classifier_name[:classifier_name.find("_")])
            combination_id = classifier_name[res.start():res.end()]
            if combination_id not in written_ids:
                target.write(classifier_name + ":\t")
                target.write("%.4f\n" % (float(results[classifier_name])))
                written_ids.append(combination_id)
def structure_combined_features():
    """
    Enumerate every combination of 2 to 5 of the seven candidate features.

    :return: AutoVivification mapping a running integer index to the list of
        feature names of one combination.
    """
    from mullpy.auxiliar import AutoVivification
    feature_pool = [
        "AGE", "EDUC", "LIMMTOTAL", "FAQ", "MMSE", "GDS", "LDELTOTAL"
    ]
    all_combinations = itertools.chain.from_iterable(
        itertools.combinations(feature_pool, size) for size in range(2, 5 + 1))
    structure = AutoVivification()
    for index, combination in enumerate(all_combinations):
        structure[index] = list(combination)
    return structure
def __init__(self, context, ensemble_name, information, pattern_kind_list):
    """
    Complete the Information class with the ensembles decisions.

    Self.info as a AutoVivification class might contain only ensemble
    internal information.

    Build real and discretized outputs of the Ensemble, depending of the
    Ensemble kind.

    :param context: global configuration dictionary.
    :param ensemble_name: name of the ensemble whose decisions are built.
    :param information: Information object holding the member classifiers'
        outputs.
    :param pattern_kind_list: pattern kinds (e.g. learning/validation) to
        build decisions for.
    """
    self.info = AutoVivification()
    self.weights = None  # presumably filled later by a weighting scheme — TODO confirm
    self.determine_ensemble_threshold(context, ensemble_name)
    for pattern_kind in pattern_kind_list:
        self._init_decision_matrix(context, ensemble_name, pattern_kind)
        self._build_decision_matrix(context, ensemble_name, information,
                                    pattern_kind)
        # A configured meta-learner takes over the combination step;
        # otherwise decisions go through the standard schedulers.
        if nested_dict_access(
                ["classifiers", ensemble_name, "meta_learner"], context):
            self.meta_learner(context, ensemble_name, information)
        else:
            self._schedule_decisions(context, ensemble_name, information,
                                     pattern_kind)
def random_distribution(self, context):
    """
    Build randomly sampled pattern subsets for ensemble construction.

    Bagging methods come in many flavours but mostly differ from each other
    by the way they draw random subsets of the training set:

    - When random subsets of the samples are drawn, the algorithm is known
      as Pasting Rvotes.
    - When samples are drawn with replacement, the method is known as
      Bagging.
    - When random subsets of the features are drawn, the method is known as
      Random Subspaces.
    - When base estimators are built on subsets of both samples and
      features, the method is known as Random Patches.

    ``group_successive`` groups each X instances; each group of successive
    instances has to be kept together in the sampling process.
    """
    total_length = 0
    lengths = AutoVivification()
    first_classifier = context["classifier_list"][0]
    for pattern_kind in context["patterns"].patterns[first_classifier]:
        lengths[pattern_kind] = len(
            context["patterns"].patterns[first_classifier][pattern_kind])
        total_length += lengths[pattern_kind]
    # Check that the patterns of every classifier have the same size
    for classifier_name in context["classifier_list"]:
        for pattern_kind in context["patterns"].patterns[classifier_name]:
            if len(context["patterns"].patterns[classifier_name]
                   [pattern_kind]) != lengths[pattern_kind]:
                # BUGFIX: the format arguments must be a tuple; the old code
                # applied "%" to the first value only and passed the second
                # as an extra ValueError argument, so the raise itself blew
                # up with a TypeError.
                raise ValueError(
                    'The length of the %s pattern of classifier %s has different size from others'
                    % (pattern_kind, classifier_name))
    random_dist = context["preprocess"]["random_distribution"]
    group_successive = random_dist["group_successive"]
    if group_successive:
        # Work in units of whole groups from here on.
        total_length = int(total_length / group_successive)
        for pattern_kind in lengths:
            lengths[pattern_kind] = int(lengths[pattern_kind] / group_successive)
    dir_name = context["general_path"] + "patterns/" + \
        context["classifiers"][first_classifier]["set"]
    filters = AutoVivification()
    ### Specific kind of sampling ###
    #############
    ###### BAGGING
    #############
    if "bagging" in random_dist and random_dist["bagging"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.bagging(context, filters, lengths, total_length)
        dir_name += "_bagging/"
    #############
    ###### PASTING
    #############
    elif "pasting_Rvotes" in random_dist and \
            random_dist["pasting_Rvotes"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.pasting_rvotes(context, filters, lengths, total_length)
        dir_name += "_pasting_Rvotes/"
    #################
    # RANDOM SUBSPACES
    #################
    elif "random_subspaces" in random_dist and \
            random_dist["random_subspaces"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.random_subspaces(context, filters, features_amount)
        dir_name += "_random_subspaces/"
    #############
    # COMBINATIONS
    #############
    elif "all_features_combination" in random_dist and \
            random_dist["all_features_combination"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.all_features_combination(context, filters, features_amount)
        dir_name += "_features_combination/"
        random_dist["number_base_classifiers"] = len(filters["learning"])
    ###############
    # RANDOM PATCHES
    ###############
    elif "random_patches" in random_dist and \
            random_dist["random_patches"]["activate"]:
        dir_name += "_random_patches/"
    ###############
    # K-FOLD
    ###############
    elif "k_fold" in random_dist and random_dist["k_fold"]["activate"]:
        for pattern_kind in random_dist["k_fold"]["percents"]:
            filters[pattern_kind] = []
        self.k_fold(context, filters)
        dir_name += "_k_fold/"
    ###############
    # Forecasting distribution
    ###############
    elif "forecasting_distribution" in random_dist and \
            random_dist["forecasting_distribution"]["activate"]:
        self.forecasting_distribution(context, filters)
        dir_name += "_walking_forward/"
    ### Common functions ###
    # BUGFIX: this block was an "elif" hanging off the chain above, which
    # made it dead code (the bagging/pasting branches were always taken
    # first). It must run *after* the sampling step to expand each group
    # index into its successive instance indexes.
    if ("bagging" in random_dist and random_dist["bagging"]["activate"]) or \
            ("pasting_Rvotes" in random_dist and
             random_dist["pasting_Rvotes"]["activate"]):
        if group_successive:
            for kind_of in filters:
                for index_filter in filters[kind_of]:
                    for i in range(len(index_filter)):
                        index_filter[i] = index_filter[i] * group_successive
                        for j in range(1, group_successive):
                            index_filter.append(index_filter[i] + j)
    path_exists(dir_name)
    self._generate_new_patterns_random_distribution(context, filters, dir_name)
def k_fold(self, context, filters):
    """
    Draw random "learning"/"validation" instance-index splits per base
    classifier.

    For each base classifier, the "learning" part is sampled at random
    (optionally class-balanced against the rarest class) and the
    "validation" part receives every remaining index.

    :param context: global configuration dictionary.
    :param filters: AutoVivification filled with one list of instance
        indexes per pattern kind and per base classifier.
    """
    first_classifier = context["classifier_list"][0]
    classes_texts = context["classifiers"][first_classifier]["classes_names"]
    patterns = context["patterns"].patterns[first_classifier]
    num_instances = sum([len(patterns[x]) for x in patterns])
    # Stack every pattern kind into one single data set.
    data_set = None
    for i, filter_name in enumerate(patterns.keys()):
        if i == 0:
            data_set = patterns[filter_name]
        else:
            # BUGFIX: np.concatenate takes a *sequence* of arrays; the old
            # call passed the second array as the "axis" argument.
            data_set = np.concatenate((data_set, patterns[filter_name]))
    total_classes_counter, classes_indexes = self.classes_counter_indexes(
        context, data_set)
    classes_counter = AutoVivification()
    # Size of the rarest class: the balancing cap for every class.
    min_limit_classes = np.min([
        total_classes_counter[class_counter]
        for class_counter in total_classes_counter
    ])
    random_dist = context["preprocess"]["random_distribution"]
    for _ in range(random_dist["number_base_classifiers"]):
        # Set gives O(1) membership tests; order is irrelevant here.
        total_indexes = set()
        for j, filter_name in enumerate(["learning", "validation"]):
            aux_list = []
            aux_percent = random_dist["k_fold"]["percents"][filter_name]
            if j == len(random_dist["k_fold"]["percents"]) - 1:
                # The last pattern kind takes every unassigned instance.
                filters[filter_name].append([
                    x for x in range(len(data_set)) if x not in total_indexes
                ])
                break
            if random_dist["k_fold"]["balanced"]:
                # Same quota for every class, derived from the rarest one.
                total_instances = 0
                for class_text in classes_texts:
                    classes_counter[filter_name][class_text] = np.ceil(
                        aux_percent * min_limit_classes)
                    total_instances += classes_counter[filter_name][class_text]
            else:
                total_instances = np.ceil(aux_percent * num_instances)
            len_inputs = len(data_set[0]) - len(classes_texts)
            while len(aux_list) != total_instances:
                value = np.random.randint(0, len(data_set))
                if value in total_indexes:
                    continue
                if random_dist["k_fold"]["balanced"]:
                    # Accept only if the instance's class still has quota.
                    instance_class = classes_texts[list(
                        data_set[value][len_inputs:]).index(1)]
                    if classes_counter[filter_name][instance_class] > 0:
                        total_indexes.add(value)
                        aux_list.append(value)
                        classes_counter[filter_name][instance_class] -= 1
                else:
                    total_indexes.add(value)
                    aux_list.append(value)
            filters[filter_name].append(aux_list)
def classes_error(self, context, classifier_name):
    """
    Compute per-class-subset selection errors for a classifier or ensemble.

    For each way of labelling i classes as 1 and the remaining ones as -1
    (plus every distinct permutation of that labelling), the patterns are
    restricted to the matching instances, the outputs are rebuilt (real
    classifiers) or sliced (ensembles), and the goodness error 'E' is
    appended to ``self.info[classifier_name]["selection_errors"]``.
    The original patterns/outputs are restored afterwards.
    """
    self.info[classifier_name]["selection_errors"] = []
    statistics_class = Statistics()
    values = AutoVivification()
    pattern_kind = context["pattern_kind"]
    outputs_kind = context["outputs_kind"]
    if classifier_name in context["classifier_list"]:
        # Real classifier: deep-copy the patterns so they can be restored.
        temporal_patterns = copy.deepcopy(
            context["patterns"].patterns[classifier_name][pattern_kind])
    else:
        # Ensemble: keep references to the original outputs and patterns.
        original = self.info[classifier_name][outputs_kind][pattern_kind]
        original_pattern_ref = context["patterns"].patterns[
            classifier_name][pattern_kind]
    for i in range(
            1,
            len(context["classifiers"][classifier_name]["classes_names"])):
        # Base labelling: i ones followed by (N - i) minus ones, plus every
        # distinct permutation of it.
        temp = [1] * i
        temp.extend([-1] * (len(
            context["classifiers"][classifier_name]["classes_names"]) - i))
        values[i] = [temp]
        for new in permutations(values[i][0]):
            if new not in values[i]:
                values[i].append(new)
        if classifier_name in context["classifier_list"]:
            # Temporarily swap in the filtered patterns and recompute the
            # classifier outputs on them.
            context["patterns"].modify_patterns_temporally(
                classifier_name, pattern_kind,
                context["patterns"].filter_classes(classifier_name,
                                                   pattern_kind,
                                                   values[i]))
            self.build_real_outputs(context, classifier_name, pattern_kind)
            self.discretize_outputs(context, classifier_name, pattern_kind)
            ref_patterns = context["patterns"].patterns[classifier_name][
                pattern_kind]
        else:
            # Ensemble: select the stored outputs of the matching instances.
            # NOTE(review): the comprehension variables below shadow the
            # loop variable ``i``; harmless in Python 3 (comprehensions have
            # their own scope) but confusing to read.
            positions = [
                position
                for position, instance in enumerate(original_pattern_ref)
                if instance[1] in values[i]
            ]
            self.info[classifier_name][outputs_kind][pattern_kind] = \
                [original[i] for i in range(len(original)) if i in positions]
            ref_patterns = [
                original_pattern_ref[i]
                for i in range(len(original_pattern_ref)) if i in positions
            ]
        statistics_class.goodness(
            context, classifier_name,
            self.info[classifier_name][outputs_kind][pattern_kind],
            ref_patterns)
        # NOTE(review): 'E' is not written by goodness() itself — presumably
        # another metric call fills measures[...]['E']; verify.
        self.info[classifier_name]["selection_errors"].append(
            statistics_class.measures[classifier_name]['E'])
    if classifier_name in context["classifier_list"]:
        #Recovery the original patterns
        context["patterns"].modify_patterns_temporally(
            classifier_name, pattern_kind, temporal_patterns)
        self.build_real_outputs(context, classifier_name, pattern_kind)
        self.discretize_outputs(context, classifier_name, pattern_kind)
    else:
        # Restore the ensemble outputs and rebuild its decisions.
        self.info[classifier_name][outputs_kind][pattern_kind] = original
        from mullpy.ensembles import Ensemble
        Ensemble(context, classifier_name, self, [pattern_kind])
def __init__(self):
    """
    Internal structure as AutoVivification class.
    """
    # Arbitrarily nested mapping; intermediate keys are created on first
    # access.
    self.info = AutoVivification()
def __init__(self, context):
    """
    Pre-create one empty pattern slot per (classifier, pattern kind) pair.

    Every classifier in ``context["classifier_list"]`` gets a ``None``
    placeholder for each kind in ``context["patterns_texts"]``, to be
    filled when the actual patterns are loaded.
    """
    self.patterns = AutoVivification()
    classifier_names = context["classifier_list"]
    kinds = context["patterns_texts"]
    for classifier in classifier_names:
        for kind in kinds:
            self.patterns[classifier][kind] = None
def __init__(self):
    """
    Initialize the internal structure as AutoVivification class.
    """
    # Measures are indexed as measures[classifier_name][...][metric_name].
    self.measures = AutoVivification()
class Statistics: """ The class where are defined all statistics functions as goodness, standard deviation or mean square error. All the information relative to the classifiers is saved on the class structure indexable by name """ def __init__(self): """ Initialize the internal structure as AutoVivification class """ self.measures = AutoVivification() ##################################################### @staticmethod def change_ranges(value, **kwargs): """ Project a given value, from old ranges to new ranges """ if len(kwargs.keys()) != 4: raise ValueError("Change ranges need 4 parameters") old_min = kwargs["oldMin"] old_max = kwargs["oldMax"] new_max = kwargs["newMax"] new_min = kwargs["newMin"] old_range = old_max - old_min new_range = new_max - new_min old_value = value return (((old_value - old_min) * new_range) / old_range) + new_min ############################################# def rms(self, classifier_name, context, information, pattern_kind): """ Calculate all rms to different patterns kind relative to the classifier. 
""" list_outputs_classifier = information.info[classifier_name]["continuous_outputs"][pattern_kind] self.measures[classifier_name]["rms"][pattern_kind] = 0.0 pattern = copy.deepcopy(context["patterns"].patterns[classifier_name][pattern_kind]) #Difference between desired outputs(patterns) and the real outputs classes_texts = context["classifiers"][classifier_name]["classes_names"] len_inputs = len(pattern[0]) - len(classes_texts) for outputs, desired in zip(list_outputs_classifier, pattern): if context["classifiers"][classifier_name]["patterns"]["range"] is not [0, 1]: for i, desire in enumerate(desired[len_inputs:]): desired[len_inputs:][i] = \ self.change_ranges( desire, oldMin=context["classifiers"][classifier_name]["patterns"]["range"][0], oldMax=context["classifiers"][classifier_name]["patterns"]["range"][1], newMin=0, newMax=1) self.measures[classifier_name]["rms"][pattern_kind] += sum(0.5 * (desired[len_inputs:] - outputs) ** 2) self.measures[classifier_name]["rms"][pattern_kind] /= float(len(pattern)) ############################################# @staticmethod def discretize_outputs(value): """ Used like a lambda function """ if value == -1: return 0. return value ############################################# def initialize_goodness(self, context, classifier_name, instances_number, classes_names): #Initialize the structure of goodness values. 
for values_kind in ['fp', 'fn', 'tp', 'tn']: self.measures[classifier_name]["matrix"][values_kind] = \ np.zeros([instances_number, len(classes_names)], dtype=np.float16) self.measures[classifier_name][values_kind] = 0.0 for class_text in classes_names: self.measures[classifier_name][class_text][values_kind] = 0.0 ############################################# def build_list_oracle_outputs(self, classifier_name): self.measures[classifier_name]["matrix"]["oracle_outputs"] = \ self.measures[classifier_name]["matrix"]["tp"] + self.measures[classifier_name]["matrix"]["tn"] ############################################# def goodness(self, context, classifier_name, list_outputs_classifier, pattern_outputs): """ Calculate the goodness of the classifier. It contain an error formula to penalize more the instances with one class, and less with more classes presents in the same instances. It is a generalization of the multiclass problem. Calculate the goodness in terms of FP, FN, TP, TN and different kinds of error as global error, false positive error, false negative error. 
""" #TODO:Change the input parameters from list outputs and patterns to Information if not len(pattern_outputs): raise NameError('Statistics doesnt get the patterns of the classifier %s correctly at dir %s' % (classifier_name, context["classifiers"][classifier_name]["paths"]["patterns"])) if not len(list_outputs_classifier): raise NameError('Statistics doesnt get the outputs of the classifier %s correctly' % classifier_name) if len(list_outputs_classifier) != len(pattern_outputs): raise NameError('Different lengths in patterns and outputs on classifier %s' % classifier_name) ############################################# #To improve code readability classes_names = context["classifiers"][classifier_name]["classes_names"] instances_number = float(len(pattern_outputs)) len_inputs = len(pattern_outputs[0]) - len(classes_names) self.initialize_goodness(context, classifier_name, int(instances_number), classes_names) ############################################# #Measure the error by instance for instance in range(int(instances_number)): #Number of classes present in an instance. For multilabel problems for output_index, class_text in enumerate(classes_names): output_wanted = pattern_outputs[instance][len_inputs:][output_index] output = list_outputs_classifier[instance][output_index] if output == (-1.): output = 0. if output_wanted == (-1.): output_wanted = 0. #If there is an error if output_wanted != output: #If output wanted was activated means a FN if output_wanted == 1.0: #FN self.measures[classifier_name]["matrix"]['fn'][instance][output_index] = 1. else: # If not output wanted was activated means a FP self.measures[classifier_name]["matrix"]['fp'][instance][output_index] = 1. #No error else: #TP if output_wanted == 1.0: self.measures[classifier_name]["matrix"]['tp'][instance][output_index] = 1. #TN else: self.measures[classifier_name]["matrix"]['tn'][instance][output_index] = 1. 
############################################# #The goodness values in terms of sum of the instances for good in self.measures[classifier_name]["matrix"].keys(): self.measures[classifier_name][good] = np.sum(self.measures[classifier_name]["matrix"][good]) for output_index, class_text in enumerate(classes_names): self.measures[classifier_name][class_text][good] = \ np.sum(self.measures[classifier_name]["matrix"][good], 0)[output_index] ######################################################################################### def error_fn(self, classifier_name, context, information, pattern_kind): pattern_outputs = context["patterns"].patterns[classifier_name][pattern_kind] classes_names = context["classifiers"][classifier_name]["classes_names"] self.measures[classifier_name]["error_fn"] = 0.0 for class_text in classes_names: self.measures[classifier_name][class_text]["error_fn"] = 0.0 for output_index, class_text in enumerate(classes_names): num_instances_of_the_class = np.sum([self.measures[classifier_name]["matrix"]['tp'][i][output_index] + self.measures[classifier_name]["matrix"]['fn'][i][output_index] for i in range(len(pattern_outputs))]) #The error depends on the number of instances of it class and on the total number of classes if len(classes_names) == 1: self.measures[classifier_name][class_text]["error_fn"] = \ 0.5 * np.sum(self.measures[classifier_name]["matrix"]["fn"], 0)[output_index] / \ num_instances_of_the_class self.measures[classifier_name][class_text]["error_fn"] = \ 0.5 * np.sum(self.measures[classifier_name]["matrix"]["fn"], 0)[output_index] / \ (float(len(pattern_outputs)) - num_instances_of_the_class) else: self.measures[classifier_name][class_text]["error_fn"] = \ (np.sum(self.measures[classifier_name]["matrix"]["fn"], 0)[output_index] / num_instances_of_the_class) / len(classes_names) for class_text in classes_names: self.measures[classifier_name]["error_fn"] += self.measures[classifier_name][class_text]["error_fn"] 
    #########################################################################################
    def error_fp(self, classifier_name, context, information, pattern_kind):
        # Accumulate the class-compensated false-positive error.
        # NOTE(review): the sums below index the matrices with "FP", but
        # initialize_goodness registers them under lowercase 'fp'; "FP"
        # therefore hits an empty AutoVivification node — suspected bug.
        pattern_outputs = context["patterns"].patterns[classifier_name][pattern_kind]
        classes_names = context["classifiers"][classifier_name]["classes_names"]
        self.measures[classifier_name]["error_fp"] = 0.0
        for class_text in classes_names:
            self.measures[classifier_name][class_text]["error_fp"] = 0.0
        for output_index, class_text in enumerate(classes_names):
            num_instances_of_the_class = np.sum(
                [self.measures[classifier_name]["matrix"]['tp'][i][output_index] +
                 self.measures[classifier_name]["matrix"]['fn'][i][output_index]
                 for i in range(len(pattern_outputs))])
            #The error depends on the number of instances of it class and on the total number of classes
            if len(classes_names) == 1:
                # NOTE(review): the first assignment is immediately
                # overwritten by the second; one was probably meant as "+=".
                self.measures[classifier_name][class_text]["error_fp"] = \
                    0.5 * np.sum(self.measures[classifier_name]["matrix"]["FP"], 0)[output_index] / \
                    num_instances_of_the_class
                self.measures[classifier_name][class_text]["error_fp"] = \
                    0.5 * np.sum(self.measures[classifier_name]["matrix"]["FP"], 0)[output_index] / \
                    (float(len(pattern_outputs)) - num_instances_of_the_class)
            else:
                self.measures[classifier_name][class_text]["error_fp"] = \
                    (np.sum(self.measures[classifier_name]["matrix"]["FP"], 0)[output_index] /
                     num_instances_of_the_class) / len(classes_names)
        for class_text in classes_names:
            self.measures[classifier_name]["error_fp"] += \
                self.measures[classifier_name][class_text]["error_fp"]

    #########################################################################################
    def error(self, classifier_name, context, information, pattern_kind):
        """
        Calculate the errors of the classifier given by name. This error
        compensates the minority class by dividing each error class by the
        number of instances of that class, and finally divided by the number
        of classes.
        """
        self.error_fp(classifier_name, context, information, pattern_kind)
        self.error_fn(classifier_name, context, information, pattern_kind)
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            self.measures[classifier_name][class_text]["error"] = \
                self.measures[classifier_name][class_text]["error_fp"] + \
                self.measures[classifier_name][class_text]["error_fn"]
        self.measures[classifier_name]["error"] = \
            self.measures[classifier_name]["error_fp"] + self.measures[classifier_name]["error_fn"]

    #########################################################################################
    def balanced_accuracy(self, classifier_name, context, *args):
        # Mean of per-class (tpr + tnr) / 2.
        # NOTE(review): tpr()/tnr() below never write the per-class
        # [class_text]["tpr"/"tnr"] entries (they overwrite the global key),
        # so these reads hit empty AutoVivification nodes — verify.
        self.tnr(classifier_name, context)
        self.tpr(classifier_name, context)
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            self.measures[classifier_name][class_text]["balanced_accuracy"] = \
                (self.measures[classifier_name][class_text]["tpr"] +
                 self.measures[classifier_name][class_text]["tnr"]) / 2.
        self.measures[classifier_name]["balanced_accuracy"] = \
            np.mean([self.measures[classifier_name][x]["balanced_accuracy"]
                     for x in context["classifiers"][classifier_name]["classes_names"]])

    #########################################################################################
    def g_means(self, classifier_name, context, *args):
        """
        Geometric mean as the sqrt of the sensibility*specificity
        """
        self.tnr(classifier_name, context)
        self.tpr(classifier_name, context)
        self.measures[classifier_name]["g_means"] = np.sqrt(
            np.dot(self.measures[classifier_name]["tnr"],
                   self.measures[classifier_name]["tpr"]))
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            self.measures[classifier_name][class_text]["g_means"] = \
                np.sqrt(np.dot(self.measures[classifier_name][class_text]["tnr"],
                               self.measures[classifier_name][class_text]["tpr"]))

    #########################################################################################
    def tnr(self, classifier_name, context, *args):
        """
        True Negative Rate
        """
        fp = self.measures[classifier_name]["fp"]
        tn = self.measures[classifier_name]["tn"]
        if tn + fp > 0:
            self.measures[classifier_name]["tnr"] = np.divide(tn, tn + fp)
        else:
            self.measures[classifier_name]["tnr"] = 0.0
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            fp = self.measures[classifier_name][class_text]["fp"]
            tn = self.measures[classifier_name][class_text]["tn"]
            if tn + fp > 0:
                # NOTE(review): this overwrites the *global* "tnr" key with
                # per-class values instead of writing
                # measures[name][class_text]["tnr"] — suspected bug; see
                # balanced_accuracy/g_means which read the per-class entry.
                self.measures[classifier_name]["tnr"] = np.divide(tn, tn + fp)
            else:
                self.measures[classifier_name]["tnr"] = 0.0

    #########################################################################################
    def tpr(self, classifier_name, context, *args):
        """
        True Positive Rate
        """
        tp = self.measures[classifier_name]["tp"]
        fn = self.measures[classifier_name]["fn"]
        if tp + fn > 0:
            self.measures[classifier_name]["tpr"] = np.divide(tp, tp + fn)
        else:
            self.measures[classifier_name]["tpr"] = 0.0
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            tp = self.measures[classifier_name][class_text]["tp"]
            fn = self.measures[classifier_name][class_text]["fn"]
            if tp + fn > 0:
                # NOTE(review): same per-class/global mixup as in tnr().
                self.measures[classifier_name]["tpr"] = np.divide(tp, tp + fn)
            else:
                self.measures[classifier_name]["tpr"] = 0.0

    #########################################################################################
    @staticmethod
    def get_ytrue_ypred(context, information, classifier_name, pattern_kind):
        # Split the stored patterns into desired-output columns (y_true) and
        # pair them with the classifier's continuous outputs (y_pred).
        len_classes = len(context["classifiers"][context["classifier_list"][0]]["classes_names"])
        len_inputs = len(context["patterns"].patterns[classifier_name][pattern_kind][0]) - len_classes
        y_true = list(context["patterns"].patterns[classifier_name][pattern_kind][
            :, range(len_inputs, len_inputs + len_classes)])
        y_pred = information.info[classifier_name]["continuous_outputs"][pattern_kind]
        return y_true, y_pred

    #########################################################################################
    def explained_variance_score(self, classifier_name, context, information,
                                 pattern_kind):
        from sklearn.metrics import explained_variance_score
        y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind)
        self.measures[classifier_name]["explained_variance_score"] = \
            explained_variance_score(y_true, y_pred)

    #########################################################################################
    def mean_absolute_error(self, classifier_name, context, information,
                            pattern_kind):
        from sklearn.metrics import mean_absolute_error
        y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind)
        # NOTE(review): result is stored under "explained_variance_score",
        # not "mean_absolute_error" — suspected copy-paste bug.
        self.measures[classifier_name]["explained_variance_score"] = \
            mean_absolute_error(y_true, y_pred)

    #########################################################################################
    def mean_squared_error(self, classifier_name, context, information,
                           pattern_kind):
        from sklearn.metrics import mean_squared_error
        y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind)
        self.measures[classifier_name]["mean_squared_error"] = \
            mean_squared_error(y_true, y_pred)

    #########################################################################################
    def r2_score(self, classifier_name, context, information, pattern_kind):
        from sklearn.metrics import r2_score
        y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind)
        self.measures[classifier_name]["r2_score"] = \
            r2_score(y_true, y_pred)

    #########################################################################################
    @staticmethod
    def confusion_matrix(classifier_name, context, information, pattern_kind):
        from sklearn.metrics import confusion_matrix
        # NOTE(review): the computed matrix is discarded — neither returned
        # nor stored; verify intent.
        confusion_matrix(context["patterns"].patterns[classifier_name][pattern_kind],
                         information.info[classifier_name]["discretized_outputs"][pattern_kind],
                         context["classifiers"][classifier_name]["classes_names"])

    #########################################################################################
    @staticmethod
    def matthews_corrcoef(classifier_name, context, information, pattern_kind):
        from sklearn.metrics import matthews_corrcoef
        # NOTE(review): result discarded, as in confusion_matrix above.
        matthews_corrcoef(context["patterns"].patterns[classifier_name][pattern_kind],
                          information.info[classifier_name]["discretized_outputs"][pattern_kind])

    #########################################################################################
    def hamming_loss(self, classifier_name, context, information,
                     pattern_kind):
        from sklearn.metrics import hamming_loss
        self.measures[classifier_name]["hamming_loss"] = \
            hamming_loss(
                context["patterns"].patterns[classifier_name][pattern_kind],
                information.info[classifier_name]["discretized_outputs"][pattern_kind])

    #########################################################################################
    def kappa(self, classifier_name, *args):
        # NOTE(review): stores the elementwise tp+tn matrix, which is not
        # Cohen's kappa — presumably an intermediate "correct" indicator;
        # verify against callers.
        self.measures[classifier_name]["kappa"] = \
            self.measures[classifier_name]["matrix"]['tp'] + \
            self.measures[classifier_name]["matrix"]['tn']

    #########################################################################################
    def f_measure(self, classifier_name, *args):
        # NOTE(review): accuracy() below computes tp/(tp+fp), which is the
        # *precision*; the naming makes this the standard F1 formula
        # (2*recall*precision/(recall+precision)) under a misleading name.
        self.recall(classifier_name, *args)
        self.accuracy(classifier_name, *args)
        self.measures[classifier_name]["f_measure"] = \
            (2 * self.measures[classifier_name]["recall"] * self.measures[classifier_name]["accuracy"]) / \
            (self.measures[classifier_name]["recall"] + self.measures[classifier_name]["accuracy"])

    #########################################################################################
    def accuracy(self, classifier_name, *args):
        # NOTE(review): tp/(tp+fp) is precision, not accuracy — naming only.
        self.measures[classifier_name]["accuracy"] = \
            self.measures[classifier_name]['tp'] / (
                self.measures[classifier_name]['tp'] + self.measures[classifier_name]['fp'])

    #########################################################################################
    def error_rate(self, classifier_name, *args):
        # NOTE(review): due to operator precedence only the 'tn' term is
        # divided by the total, and tp+tn is the *correct* count, not the
        # error count — error_rate/accuracy_rate look swapped and both
        # need parentheses; verify before relying on these values.
        self.measures[classifier_name]["error_rate"] = \
            self.measures[classifier_name]["matrix"]['tp'] + self.measures[classifier_name]["matrix"]['tn'] / \
            (np.sum([self.measures[classifier_name]["matrix"][goodness]
                     for goodness in self.measures[classifier_name]["matrix"].keys()]))

    #########################################################################################
    def accuracy_rate(self, classifier_name, *args):
        # NOTE(review): same precedence/naming concern as error_rate above.
        self.measures[classifier_name]["accuracy_rate"] = \
            self.measures[classifier_name]["matrix"]['fp'] + self.measures[classifier_name]["matrix"]['fn'] / \
            (np.sum(
                [self.measures[classifier_name]["matrix"][goodness]
                 for goodness in self.measures[classifier_name]["matrix"].keys()]))

    #########################################################################################
    def recall(self, classifier_name, *args):
        # Recall is the true-positive rate.
        self.tpr(classifier_name, *args)
        self.measures[classifier_name]["recall"] = self.measures[classifier_name]["tpr"]

    #########################################################################################
    def fn_rate(self, classifier_name, *args):
        # NOTE(review): operates on the per-instance matrices, so the result
        # is an elementwise array, not a scalar rate — verify consumers.
        self.measures[classifier_name]["fn_rate"] = self.measures[classifier_name]["matrix"]['fn'] / (
            self.measures[classifier_name]["matrix"]['tp'] +
            self.measures[classifier_name]["matrix"]['fn'])

    #########################################################################################
    def fp_rate(self, classifier_name, *args):
        # NOTE(review): same elementwise-array concern as fn_rate.
        self.measures[classifier_name]["fp_rate"] = self.measures[classifier_name]["matrix"]['fp'] / (
            self.measures[classifier_name]["matrix"]['tn'] +
            self.measures[classifier_name]["matrix"]['fp'])

    #########################################################################################
    def auc(self, classifier_name, context, information, pattern_kind):
        # Mean one-vs-rest ROC AUC over all classes.
        from sklearn.metrics import roc_auc_score
        classes_names = context["classifiers"][classifier_name]["classes_names"]
        inputs = len(context["patterns"].patterns[classifier_name][pattern_kind][0]) - len(classes_names)
        self.measures[classifier_name]["auc"] = 0.0
        for i, class_name in enumerate(classes_names):
            self.measures[classifier_name][class_name]["auc"] = \
                roc_auc_score(context["patterns"].patterns[classifier_name][pattern_kind][:, inputs + i],
                              information.info[classifier_name]["continuous_outputs"][pattern_kind][:, i])
            self.measures[classifier_name]["auc"] += self.measures[classifier_name][class_name]["auc"]
        self.measures[classifier_name]["auc"] = np.divide(np.mean(self.measures[classifier_name]["auc"]),
                                                          len(classes_names))

    #########################################################################################
    def std(self, classifier_name, context, *args):
        """
        Calculate the standard deviation of the classifier passed as args,
        for each kind of error. Thus, there is a std for false positive
        error, another to false positive error, etc.
        """
        # NOTE(review): matrix keys 'efp'/'efn'/'e' are never created by
        # initialize_goodness (which registers fp/fn/tp/tn); verify where
        # these matrices are filled before trusting this method.
        self.measures[classifier_name]['dt_efp'] = np.std(self.measures[classifier_name]["matrix"]['efp'])
        self.measures[classifier_name]['dt_efn'] = np.std(self.measures[classifier_name]["matrix"]['efn'])
        self.measures[classifier_name]['dt_e'] = np.std(self.measures[classifier_name]["matrix"]['efp'] +
                                                        self.measures[classifier_name]["matrix"]['efn'])
        for output_index, class_text in enumerate(context["classifiers"][classifier_name]["classes_names"]):
            self.measures[classifier_name][class_text]['dt_efp'] = \
                np.std(self.measures[classifier_name]["matrix"]["efp"], 0)[output_index]
            self.measures[classifier_name][class_text]['dt_efn'] = \
                np.std(self.measures[classifier_name]["matrix"]["efn"], 0)[output_index]
            self.measures[classifier_name][class_text]['dt_e'] = \
                np.std(self.measures[classifier_name]["matrix"]["e"], 0)[output_index]

    #############################################
    @staticmethod
    def __build_multiple_name(sub_list):
        # Join names with "+".
        # NOTE(review): the branch condition looks inverted — the non-list
        # branch iterates sub_list as tuples of strings while the list
        # branch concatenates elements; confirm the intended types.
        name = ""
        if type(sub_list) != list:
            for x_tuple in sub_list:
                name = "+".join([x for x in x_tuple])
        else:
            for i, name_i in enumerate(sub_list):
                if i == len(sub_list) - 1:
                    name += name_i
                else:
                    name += name_i + "+"
        return name

    #############################################
    def correctly_classified(self, sub_list):
        # NOTE: definition continues beyond this excerpt; truncated here.
        correctly_classified = 
np.zeros(len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"])) for i in range(len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"])): for j, classifier_name in enumerate(sub_list): if (np.array(self.measures[classifier_name]["matrix"]["oracle_outputs"][i]) == np.ones( len(self.measures[classifier_name]["matrix"]["oracle_outputs"][i]))).all(): correctly_classified[i] += 1 return correctly_classified ############################################# def interrater_agreement_k_non_pairwise(self, context, sub_list): error = 0.0 correctly_classified = self.correctly_classified(sub_list) p = np.sum([self.measures[x]['E'] for x in self.measures if 'E' in self.measures[x]]) / \ (len(sub_list) * len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"])) for i in range(len(correctly_classified)): error += correctly_classified[i] * (len(sub_list) - correctly_classified[i]) if p == 0.0: p = np.exp(100) error /= len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) * (len(sub_list) - 1) * p * (1 - p) return 1 - error ############################################# def difficulty(self, context, sub_list): error = 0.0 correctly_classified = self.correctly_classified(sub_list) mean_errors = np.mean(correctly_classified) for i in range(len(correctly_classified)): error += np.power((correctly_classified[i] - (correctly_classified[i] / mean_errors)), 2) error /= (len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) * np.power(len(sub_list), 2)) return 1. 
- error ############################################# def kohavi_wolpert(self, context, sub_list): error = 0.0 correctly_classified = self.correctly_classified(sub_list) for i in range(len(correctly_classified)): error += correctly_classified[i] * (len(sub_list) - correctly_classified[i]) error /= len(sub_list) error /= (len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) * np.power(len(sub_list), 2)) return error ############################################# def entropy(self, context, sub_list): Error = 0.0 correctly_classified = self.correctly_classified(sub_list) for i in range(len(correctly_classified)): Error += (min(correctly_classified[i], len(sub_list) - correctly_classified[i]) / (len(sub_list) - np.ceil(len(sub_list) / 2.))) Error /= len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) return Error ############################################# def diversity_non_pairwise_structure(self, context, function, classifier_list): for i, classifier_name in enumerate(classifier_list): if context["interactive"]["activate"]: sys.stdout.write("\r{0}>".format("Completed:%f%%" % ((float(i) / len(classifier_list)) * 100))) sys.stdout.flush() # name = self.__build_multiple_name(sub_list) self.measures[classifier_name][function] = \ getattr(self, function)(context, context["classifiers"][classifier_name]["classifiers"]) ############################################# def diversity_pairwise_structure(self, context, function, classifier_list): for i, classifier_1 in enumerate(classifier_list): if context["interactive"]["activate"]: sys.stdout.write("\r{0}>".format("Completed:%f%%" % ((float(i) / len(classifier_list)) * 100))) sys.stdout.flush() for classifier_2 in context["classifiers"].keys(): if "pairwise_diversity" in self.measures[classifier_2].keys() and function in \ self.measures[classifier_2][ "pairwise_diversity"].keys() and classifier_1 in \ self.measures[classifier_2]["pairwise_diversity"][ function].keys(): 
self.measures[classifier_1]["pairwise_diversity"][function][classifier_2] = \ self.measures[classifier_2]["pairwise_diversity"][function][classifier_1] else: self.measures[classifier_1]["pairwise_diversity"][function][classifier_2] = \ getattr(self, function)(classifier_1, classifier_2, context) vector = [self.measures[classifier_1]["pairwise_diversity"][function][x] for x in self.measures[classifier_1]["pairwise_diversity"][function].keys() if x != classifier_1] self.measures[classifier_1]["pairwise_diversity"][function]["mean"] = np.mean(vector) self.measures[classifier_1]["pairwise_diversity"][function]["median"] = np.median(vector) self.measures[classifier_1]["pairwise_diversity"][function]["std"] = np.std(vector) self.measures[classifier_1]["pairwise_diversity"][function]["variance"] = np.var(vector) ############################################# def error_correlation(self, classifier_1, classifier_2, context): return np.corrcoef(self.measures[classifier_1]["matrix"]["e"], self.measures[classifier_2]["matrix"]["e"])[0][1] ############################################# def n01(self, classifier_1, classifier_2): counter = 0 for a, b in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if np.sum(a) < len(a) and np.sum(b) == len(b): counter += 1 return counter ############################################# def n10(self, classifier_1, classifier_2): counter = 0 for a, b in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if np.sum(a) == len(a) and np.sum(b) < len(b): counter += 1 return counter ############################################# def n11(self, classifier_1, classifier_2): counter = 0 for a, b in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if (a == b).all() and np.sum(a) == len(a): counter += 1 return counter 
############################################# def n00(self, classifier_1, classifier_2): counter = 0 for a, b in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if np.sum(b) < len(b) and np.sum(a) < len(a): counter += 1 return counter ############################################# def _n_values(self, classifier_1, classifier_2, context): #this results may be divided n11 = None n00 = None n10 = None n01 = None if context["results"]["to_file"]["diversity_study"]["exact_match"]: n11 = self.n11(classifier_1, classifier_2) n00 = self.n00(classifier_1, classifier_2) n10 = self.n10(classifier_1, classifier_2) n01 = self.n01(classifier_1, classifier_2) elif context["results"]["to_file"]["diversity_study"]["by_class"]: # TODO: change this part for i in range(len(self.measures[classifier_1]["matrix"]["oracle_outputs"][0])): n11 = sum([1 if x == y and x == 1 else 0 for x, y in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"])]) n00 = sum( [1 if x == y and x == 0 else 0 for x, y in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"])]) n01 = sum([1 if x != y and x == 0 and y == 1 else 0 for x, y in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"])]) n10 = sum([1 if x != y and x == 1 and y == 0 else 0 for x, y in zip(self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"])]) else: raise ValueError("No option selected in diversity study: by class or by exact match") return {"n11": n11, "N00": n00, "N01": n01, "N10": n10} ############################################# def interrater_agreement_k(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = ((values["N11"] + values["N10"]) * 
(values["N01"] + values["N00"])) + \ ((values["N11"] + values["N01"]) * (values["N10"] + values["N00"])) numerator = 2 * ((values["N11"] * values["N00"]) - (values["N01"] * values["N10"])) return numerator / denominator ############################################# def q_statistic(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = values["N11"] * values["N00"] + values["N01"] * values["N10"] if not denominator: denominator = 1 return (values["N11"] * values["N00"] - values["N01"] * values["N10"]) / denominator ############################################# def coefficient_p(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = np.sqrt((values["N11"] + values["N10"]) * (values["N01"] + values["N00"]) * ( values["N11"] + values["N01"]) * (values["N10"] + values["N00"])) if not denominator: denominator = 1 return (values["N11"] * values["N00"] - values["N01"] * values["N10"]) / denominator ############################################# def disagreement(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = values["N11"] * values["N00"] + values["N01"] + values["N10"] if not denominator: denominator = 1 return (values["N01"] + values["N10"]) / denominator ############################################# def double_fault(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = values["N11"] + values["N10"] + values["N01"] + values["N00"] if not denominator: denominator = 1 return values["N00"] / denominator ################################################################ def configuration_evaluation(self, context, classifier_name, information): """ To be reconstructed into a abstraction model. Initialize the information of each classifier. 
""" #information_class.automatic_threshold_determine(context,classifier_name) pattern_kind = "validation" self.rms(classifier_name, context, information, pattern_kind) name = classifier_name[:re.search(r'[A-Za-z]+[0-9]*', classifier_name).end()] neurons = context["classifiers"][classifier_name]["configuration"]["neurons"][0] if len(self.measures[name]["evaluation"][neurons].keys()): self.measures[name]["evaluation"][neurons]['rms'].append( self.measures[classifier_name]['rms'][pattern_kind]) self.measures[name]["evaluation"][neurons]['names'].append(classifier_name) else: self.measures[name]["evaluation"][neurons]['rms'] = [] self.measures[name]["evaluation"][neurons]['rms'].append( self.measures[classifier_name]['rms'][pattern_kind]) self.measures[name]["evaluation"][neurons]['names'] = [] self.measures[name]["evaluation"][neurons]['names'].append(classifier_name) #################################################### def best_choice(self): """ Select the best configuration of a NN classifier with the class attributes information. 
""" for name in sorted([x for x in self.measures.keys() if "evaluation" in self.measures[x].keys()]): self.measures[name]["selection"]["rms"] = [99999.0] self.measures[name]["selection"]["neurons"]["hidden"] = [0] self.measures[name]["selection"]["name"] = [""] for neuron in sorted(self.measures[name]["evaluation"].keys()): self.measures[name]["selection"]["neurons"][neuron]["amount"] = 0 rms_list, names_list = (list(t) for t in zip(*sorted(zip(self.measures[ name]["evaluation"][neuron]['rms'], self.measures[name]["evaluation"][neuron][ 'names'])))) mean_rms = np.mean(self.measures[name]["evaluation"][neuron]['rms']) if mean_rms < self.measures[name]["selection"]["rms"][0]: self.measures[name]["selection"]["rms"] = [mean_rms] self.measures[name]["selection"]["neurons"]["hidden"] = [neuron] self.measures[name]["selection"]["neurons"][neuron]["amount"] = 1 self.measures[name]["selection"]["names"] = \ [self.measures[name]["evaluation"][neuron]['names'][self.measures[name]["evaluation"][neuron][ 'rms'].index(sorted( self.measures[name]["evaluation"][neuron][ 'rms'])[0])]] elif mean_rms == self.measures[name]["selection"]["rms"][0]: self.measures[name]["selection"]["rms"].append(mean_rms) self.measures[name]["selection"]["neurons"]["hidden"].append(neuron) for i in range(len(self.measures[name]["evaluation"][neuron]['rms'])): if rms_list[i] == rms_list[0]: self.measures[name]["selection"]["names"].append(names_list[i]) self.measures[name]["selection"]["neurons"][neuron]["amount"] += 1 ################################################################ @staticmethod def pre_forecasting_statistic(context, classifier_name, information, pattern_kind): len_classes = len(context["classifiers"][classifier_name]["classes_names"]) len_inputs = len(context["patterns"].patterns[classifier_name][pattern_kind][0]) - len_classes classifier_outputs = information.info[classifier_name]["continuous_outputs"][pattern_kind] classifier_patterns = \ 
context["patterns"].patterns[classifier_name][pattern_kind][:, (len_inputs - 1, len_inputs)] len_patterns = len(context["patterns"].patterns[classifier_name][pattern_kind]) d_change_pred = np.zeros(len_patterns) d_change_true = np.zeros(len_patterns) for i, instance, outputs in zip(range(len_patterns), classifier_patterns, classifier_outputs): d_change_true[i] = instance[1] - instance[0] d_change_pred[i] = outputs[0] - instance[0] return d_change_pred, d_change_true ################################################################ def tendency_accuracy(self, classifier_name, context, information, pattern_kind): """ Calculates the number of tends hits on a regression problem. The regression tolerance is a parameter added to avoid the errors due to overflow :param classifier_name: :param context: :param information: :param pattern_kind: :return: """ array_change_pred, array_change_true = Statistics().pre_forecasting_statistic(context, classifier_name, information, pattern_kind) hits = np.zeros(len(array_change_pred)) for i, d_change_pred, d_change_true in zip(range(len(array_change_pred)), array_change_pred, array_change_true): if d_change_pred * d_change_true > 0.0: hits[i] = 1. elif d_change_pred * d_change_true == 0.0: hits[i] = 1. else: if np.sqrt(np.abs(d_change_pred * d_change_true)) < context["regression_tolerance_tendency"]: hits[i] = 1. else: hits[i] = 0. self.measures[classifier_name]["tendency_accuracy"] = np.mean(hits) ######################################################################################### def mase(self, classifier_name, context, information, pattern_kind): """ Mean Absolute error. Returns the inverse of the mase with a denominator that sums 1 to the error. It is intended to give an error between 1 and 0, where the 1 is the lowest error and 0.0 the highest in order to be compatible ordering different measures in the presentations. 
:param classifier_name: :param context: :param information: :param pattern_kind: :return: """ array_change_pred, array_change_true = self.pre_forecasting_statistic(context, classifier_name, information, pattern_kind) self.measures[classifier_name]["mase"] = np.divide(np.mean(np.absolute(array_change_pred)), np.mean(np.absolute(array_change_true)))
class Statistics:
    """
    The class where are defined all statistics functions as goodness,
    standard deviation or mean square error.  All the information relative
    to the classifiers is saved on the class structure (self.measures),
    indexable by classifier name.
    """

    def __init__(self):
        """
        Initialize the internal structure as AutoVivification class
        """
        self.measures = AutoVivification()

    #####################################################
    @staticmethod
    def change_ranges(value, **kwargs):
        """
        Linearly project `value` from [oldMin, oldMax] to [newMin, newMax].

        :raises ValueError: unless exactly the four range keywords are given.
        """
        if len(kwargs.keys()) != 4:
            raise ValueError("Change ranges need 4 parameters")
        old_min = kwargs["oldMin"]
        old_max = kwargs["oldMax"]
        new_max = kwargs["newMax"]
        new_min = kwargs["newMin"]
        old_range = old_max - old_min
        new_range = new_max - new_min
        return (((value - old_min) * new_range) / old_range) + new_min

    #############################################
    def rms(self, classifier_name, context, information, pattern_kind):
        """
        Calculate the rms for the given pattern kind: the mean over the
        instances of 0.5 * squared difference between desired and produced
        outputs, with desired outputs rescaled to [0, 1] first.
        """
        list_outputs_classifier = information.info[classifier_name][
            "continuous_outputs"][pattern_kind]
        self.measures[classifier_name]["rms"][pattern_kind] = 0.0
        # Deep copy because the desired outputs are rescaled in place below.
        pattern = copy.deepcopy(
            context["patterns"].patterns[classifier_name][pattern_kind])
        classes_texts = context["classifiers"][classifier_name][
            "classes_names"]
        len_inputs = len(pattern[0]) - len(classes_texts)
        for outputs, desired in zip(list_outputs_classifier, pattern):
            # Bug fix: the original used "is not [0, 1]" — an identity test
            # against a fresh list literal, which is always True.  Value
            # inequality is intended; for range == [0, 1] the projection is
            # the identity, so the numeric result is unchanged.
            if context["classifiers"][classifier_name]["patterns"][
                    "range"] != [0, 1]:
                # NOTE(review): desired[len_inputs:][i] = ... relies on
                # numpy view semantics; with a plain list the write would
                # hit a temporary copy — confirm patterns are ndarrays.
                for i, desire in enumerate(desired[len_inputs:]):
                    desired[len_inputs:][i] = \
                        self.change_ranges(
                            desire,
                            oldMin=context["classifiers"][classifier_name]["patterns"]["range"][0],
                            oldMax=context["classifiers"][classifier_name]["patterns"]["range"][1],
                            newMin=0,
                            newMax=1)
            self.measures[classifier_name]["rms"][pattern_kind] += sum(
                0.5 * (desired[len_inputs:] - outputs)**2)
        self.measures[classifier_name]["rms"][pattern_kind] /= float(
            len(pattern))

    #############################################
    @staticmethod
    def discretize_outputs(value):
        """
        Used like a lambda function: map the -1 encoding to 0.
        """
        if value == -1:
            return 0.
        return value

    #############################################
    def initialize_goodness(self, context, classifier_name, instances_number,
                            classes_names):
        """Initialize the structure of goodness values: one
        (instances x classes) float16 indicator matrix per outcome kind,
        plus zeroed aggregate and per-class counters."""
        for values_kind in ['fp', 'fn', 'tp', 'tn']:
            self.measures[classifier_name]["matrix"][values_kind] = \
                np.zeros([instances_number, len(classes_names)],
                         dtype=np.float16)
            self.measures[classifier_name][values_kind] = 0.0
            for class_text in classes_names:
                self.measures[classifier_name][class_text][values_kind] = 0.0

    #############################################
    def build_list_oracle_outputs(self, classifier_name):
        """Oracle outputs: 1 where the classifier was right (tp or tn)."""
        self.measures[classifier_name]["matrix"]["oracle_outputs"] = \
            self.measures[classifier_name]["matrix"]["tp"] + \
            self.measures[classifier_name]["matrix"]["tn"]

    #############################################
    def goodness(self, context, classifier_name, list_outputs_classifier,
                 pattern_outputs):
        """
        Calculate the goodness of the classifier in terms of FP, FN, TP, TN
        per instance and per class (a generalization to the multiclass /
        multilabel problem), then aggregate the counts.

        :raises NameError: on missing patterns, missing outputs, or a
            length mismatch between them.
        """
        #TODO:Change the input parameters from list outputs and patterns to Information
        if not len(pattern_outputs):
            raise NameError(
                'Statistics doesnt get the patterns of the classifier %s correctly at dir %s'
                % (classifier_name,
                   context["classifiers"][classifier_name]["paths"]["patterns"]))
        if not len(list_outputs_classifier):
            raise NameError(
                'Statistics doesnt get the outputs of the classifier %s correctly'
                % classifier_name)
        if len(list_outputs_classifier) != len(pattern_outputs):
            raise NameError(
                'Different lengths in patterns and outputs on classifier %s'
                % classifier_name)
        #############################################
        #To improve code readability
        classes_names = context["classifiers"][classifier_name][
            "classes_names"]
        instances_number = float(len(pattern_outputs))
        # Desired outputs occupy the trailing columns of each pattern row.
        len_inputs = len(pattern_outputs[0]) - len(classes_names)
        self.initialize_goodness(context, classifier_name,
                                 int(instances_number), classes_names)
        #############################################
        #Measure the error by instance
        for instance in range(int(instances_number)):
            #Number of classes present in an instance. For multilabel problems
            for output_index, class_text in enumerate(classes_names):
                output_wanted = pattern_outputs[instance][len_inputs:][
                    output_index]
                output = list_outputs_classifier[instance][output_index]
                # Normalize the -1 encoding to 0 on both sides.
                if output == (-1.):
                    output = 0.
                if output_wanted == (-1.):
                    output_wanted = 0.
                #If there is an error
                if output_wanted != output:
                    #If output wanted was activated means a FN
                    if output_wanted == 1.0:
                        #FN
                        self.measures[classifier_name]["matrix"]['fn'][
                            instance][output_index] = 1.
                    else:
                        # If not output wanted was activated means a FP
                        self.measures[classifier_name]["matrix"]['fp'][
                            instance][output_index] = 1.
                #No error
                else:
                    #TP
                    if output_wanted == 1.0:
                        self.measures[classifier_name]["matrix"]['tp'][
                            instance][output_index] = 1.
                    #TN
                    else:
                        self.measures[classifier_name]["matrix"]['tn'][
                            instance][output_index] = 1.
        #############################################
        #The goodness values in terms of sum of the instances
        for good in self.measures[classifier_name]["matrix"].keys():
            self.measures[classifier_name][good] = np.sum(
                self.measures[classifier_name]["matrix"][good])
            for output_index, class_text in enumerate(classes_names):
                self.measures[classifier_name][class_text][good] = \
                    np.sum(self.measures[classifier_name]["matrix"][good],
                           0)[output_index]

    #########################################################################################
    def error_fn(self, classifier_name, context, information, pattern_kind):
        """
        Per-class and total false-negative error, weighting each class by
        its instance count (and by the class count in the multiclass case)
        so minority classes are not drowned out.
        """
        pattern_outputs = context["patterns"].patterns[classifier_name][
            pattern_kind]
        classes_names = context["classifiers"][classifier_name][
            "classes_names"]
        self.measures[classifier_name]["error_fn"] = 0.0
        for class_text in classes_names:
            self.measures[classifier_name][class_text]["error_fn"] = 0.0
        for output_index, class_text in enumerate(classes_names):
            # Instances that truly belong to the class: tp + fn.
            num_instances_of_the_class = np.sum([
                self.measures[classifier_name]["matrix"]['tp'][i][output_index]
                + self.measures[classifier_name]["matrix"]['fn'][i][output_index]
                for i in range(len(pattern_outputs))
            ])
            #The error depends on the number of instances of it class and on the total number of classes
            if len(classes_names) == 1:
                # NOTE(review): the first assignment is immediately
                # overwritten by the second — it looks like the second was
                # meant to be "+=".  Behavior kept as-is; confirm.
                self.measures[classifier_name][class_text]["error_fn"] = \
                    0.5 * np.sum(self.measures[classifier_name]["matrix"]["fn"], 0)[output_index] / \
                    num_instances_of_the_class
                self.measures[classifier_name][class_text]["error_fn"] = \
                    0.5 * np.sum(self.measures[classifier_name]["matrix"]["fn"], 0)[output_index] / \
                    (float(len(pattern_outputs)) - num_instances_of_the_class)
            else:
                self.measures[classifier_name][class_text]["error_fn"] = \
                    (np.sum(self.measures[classifier_name]["matrix"]["fn"], 0)[output_index] /
                     num_instances_of_the_class) / len(classes_names)
        for class_text in classes_names:
            self.measures[classifier_name]["error_fn"] += self.measures[
                classifier_name][class_text]["error_fn"]
######################################################################################### def error_fp(self, classifier_name, context, information, pattern_kind): pattern_outputs = context["patterns"].patterns[classifier_name][ pattern_kind] classes_names = context["classifiers"][classifier_name][ "classes_names"] self.measures[classifier_name]["error_fp"] = 0.0 for class_text in classes_names: self.measures[classifier_name][class_text]["error_fp"] = 0.0 for output_index, class_text in enumerate(classes_names): num_instances_of_the_class = np.sum([ self.measures[classifier_name]["matrix"]['tp'][i][output_index] + self.measures[classifier_name]["matrix"]['fn'][i][output_index] for i in range(len(pattern_outputs)) ]) #The error depends on the number of instances of it class and on the total number of classes if len(classes_names) == 1: self.measures[classifier_name][class_text]["error_fp"] = \ 0.5 * np.sum(self.measures[classifier_name]["matrix"]["FP"], 0)[output_index] / \ num_instances_of_the_class self.measures[classifier_name][class_text]["error_fp"] = \ 0.5 * np.sum(self.measures[classifier_name]["matrix"]["FP"], 0)[output_index] / \ (float(len(pattern_outputs)) - num_instances_of_the_class) else: self.measures[classifier_name][class_text]["error_fp"] = \ (np.sum(self.measures[classifier_name]["matrix"]["FP"], 0)[output_index] / num_instances_of_the_class) / len(classes_names) for class_text in classes_names: self.measures[classifier_name]["error_fp"] += self.measures[ classifier_name][class_text]["error_fp"] ######################################################################################### def error(self, classifier_name, context, information, pattern_kind): """ Calculate the errors of the classifier given by name. This error compensates the minority class by dividing each error class by the number of instances of that class, and finally divided by the number of classes. 
""" self.error_fp(classifier_name, context, information, pattern_kind) self.error_fn(classifier_name, context, information, pattern_kind) for class_text in context["classifiers"][classifier_name][ "classes_names"]: self.measures[classifier_name][class_text]["error"] = \ self.measures[classifier_name][class_text]["error_fp"] + \ self.measures[classifier_name][class_text]["error_fn"] self.measures[classifier_name]["error"] = \ self.measures[classifier_name]["error_fp"] + self.measures[classifier_name]["error_fn"] ######################################################################################### def balanced_accuracy(self, classifier_name, context, *args): self.tnr(classifier_name, context) self.tpr(classifier_name, context) for class_text in context["classifiers"][classifier_name][ "classes_names"]: self.measures[classifier_name][class_text]["balanced_accuracy"] = \ (self.measures[classifier_name][class_text]["tpr"] + self.measures[classifier_name][class_text]["tnr"]) / 2. self.measures[classifier_name]["balanced_accuracy"] = \ np.mean([self.measures[classifier_name][x]["balanced_accuracy"] for x in context["classifiers"][classifier_name]["classes_names"]]) ######################################################################################### def g_means(self, classifier_name, context, *args): """ Geometric mean as the sqrt of the sensibility*specificity """ self.tnr(classifier_name, context) self.tpr(classifier_name, context) self.measures[classifier_name]["g_means"] = np.sqrt( np.dot(self.measures[classifier_name]["tnr"], self.measures[classifier_name]["tpr"])) for class_text in context["classifiers"][classifier_name][ "classes_names"]: self.measures[classifier_name][class_text]["g_means"] = \ np.sqrt(np.dot(self.measures[classifier_name][class_text]["tnr"], self.measures[classifier_name][class_text]["tpr"])) ######################################################################################### def tnr(self, classifier_name, context, *args): """ True 
Negative Rate """ fp = self.measures[classifier_name]["fp"] tn = self.measures[classifier_name]["tn"] if tn + fp > 0: self.measures[classifier_name]["tnr"] = np.divide(tn, tn + fp) else: self.measures[classifier_name]["tnr"] = 0.0 for class_text in context["classifiers"][classifier_name][ "classes_names"]: fp = self.measures[classifier_name][class_text]["fp"] tn = self.measures[classifier_name][class_text]["tn"] if tn + fp > 0: self.measures[classifier_name]["tnr"] = np.divide(tn, tn + fp) else: self.measures[classifier_name]["tnr"] = 0.0 ######################################################################################### def tpr(self, classifier_name, context, *args): """ True Positive Rate """ tp = self.measures[classifier_name]["tp"] fn = self.measures[classifier_name]["fn"] if tp + fn > 0: self.measures[classifier_name]["tpr"] = np.divide(tp, tp + fn) else: self.measures[classifier_name]["tpr"] = 0.0 for class_text in context["classifiers"][classifier_name][ "classes_names"]: tp = self.measures[classifier_name][class_text]["tp"] fn = self.measures[classifier_name][class_text]["fn"] if tp + fn > 0: self.measures[classifier_name]["tpr"] = np.divide(tp, tp + fn) else: self.measures[classifier_name]["tpr"] = 0.0 ######################################################################################### @staticmethod def get_ytrue_ypred(context, information, classifier_name, pattern_kind): len_classes = len(context["classifiers"][context["classifier_list"][0]] ["classes_names"]) len_inputs = len(context["patterns"].patterns[classifier_name] [pattern_kind][0]) - len_classes y_true = list(context["patterns"].patterns[classifier_name] [pattern_kind][:, range(len_inputs, len_inputs + len_classes)]) y_pred = information.info[classifier_name]["continuous_outputs"][ pattern_kind] return y_true, y_pred ######################################################################################### def explained_variance_score(self, classifier_name, context, information, 
pattern_kind): from sklearn.metrics import explained_variance_score y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind) self.measures[classifier_name]["explained_variance_score"] = \ explained_variance_score(y_true, y_pred) ######################################################################################### def mean_absolute_error(self, classifier_name, context, information, pattern_kind): from sklearn.metrics import mean_absolute_error y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind) self.measures[classifier_name]["explained_variance_score"] = \ mean_absolute_error(y_true, y_pred) ######################################################################################### def mean_squared_error(self, classifier_name, context, information, pattern_kind): from sklearn.metrics import mean_squared_error y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind) self.measures[classifier_name]["mean_squared_error"] = \ mean_squared_error(y_true, y_pred) ######################################################################################### def r2_score(self, classifier_name, context, information, pattern_kind): from sklearn.metrics import r2_score y_true, y_pred = self.get_ytrue_ypred(context, information, classifier_name, pattern_kind) self.measures[classifier_name]["r2_score"] = \ r2_score(y_true, y_pred) ######################################################################################### @staticmethod def confusion_matrix(classifier_name, context, information, pattern_kind): from sklearn.metrics import confusion_matrix confusion_matrix( context["patterns"].patterns[classifier_name][pattern_kind], information.info[classifier_name]["discretized_outputs"] [pattern_kind], context["classifiers"][classifier_name]["classes_names"]) ######################################################################################### @staticmethod def 
matthews_corrcoef(classifier_name, context, information, pattern_kind): from sklearn.metrics import matthews_corrcoef matthews_corrcoef( context["patterns"].patterns[classifier_name][pattern_kind], information.info[classifier_name]["discretized_outputs"] [pattern_kind]) ######################################################################################### def hamming_loss(self, classifier_name, context, information, pattern_kind): from sklearn.metrics import hamming_loss self.measures[classifier_name]["hamming_loss"] = \ hamming_loss( context["patterns"].patterns[classifier_name][pattern_kind], information.info[classifier_name]["discretized_outputs"][pattern_kind]) ######################################################################################### def kappa(self, classifier_name, *args): self.measures[classifier_name]["kappa"] = \ self.measures[classifier_name]["matrix"]['tp'] + self.measures[classifier_name]["matrix"]['tn'] ######################################################################################### def f_measure(self, classifier_name, *args): self.recall(classifier_name, *args) self.accuracy(classifier_name, *args) self.measures[classifier_name]["f_measure"] = \ (2 * self.measures[classifier_name]["recall"] * self.measures[classifier_name]["accuracy"]) / \ (self.measures[classifier_name]["recall"] + self.measures[classifier_name]["accuracy"]) ######################################################################################### def accuracy(self, classifier_name, *args): self.measures[classifier_name]["accuracy"] = \ self.measures[classifier_name]['tp'] / ( self.measures[classifier_name]['tp'] + self.measures[classifier_name]['fp']) ######################################################################################### def error_rate(self, classifier_name, *args): self.measures[classifier_name]["error_rate"] = \ self.measures[classifier_name]["matrix"]['tp'] + self.measures[classifier_name]["matrix"]['tn'] / \ 
(np.sum([self.measures[classifier_name]["matrix"][goodness] for goodness in self.measures[classifier_name]["matrix"].keys()])) ######################################################################################### def accuracy_rate(self, classifier_name, *args): self.measures[classifier_name]["accuracy_rate"] = \ self.measures[classifier_name]["matrix"]['fp'] + self.measures[classifier_name]["matrix"]['fn'] / \ (np.sum( [self.measures[classifier_name]["matrix"][goodness] for goodness in self.measures[classifier_name]["matrix"].keys()])) ######################################################################################### def recall(self, classifier_name, *args): self.tpr(classifier_name, *args) self.measures[classifier_name]["recall"] = self.measures[ classifier_name]["tpr"] ######################################################################################### def fn_rate(self, classifier_name, *args): self.measures[classifier_name][ "fn_rate"] = self.measures[classifier_name]["matrix"]['fn'] / ( self.measures[classifier_name]["matrix"]['tp'] + self.measures[classifier_name]["matrix"]['fn']) ######################################################################################### def fp_rate(self, classifier_name, *args): self.measures[classifier_name][ "fp_rate"] = self.measures[classifier_name]["matrix"]['fp'] / ( self.measures[classifier_name]["matrix"]['tn'] + self.measures[classifier_name]["matrix"]['fp']) ######################################################################################### def auc(self, classifier_name, context, information, pattern_kind): from sklearn.metrics import roc_auc_score classes_names = context["classifiers"][classifier_name][ "classes_names"] inputs = len(context["patterns"].patterns[classifier_name] [pattern_kind][0]) - len(classes_names) self.measures[classifier_name]["auc"] = 0.0 for i, class_name in enumerate(classes_names): self.measures[classifier_name][class_name]["auc"] = \ 
roc_auc_score(context["patterns"].patterns[classifier_name][pattern_kind][:, inputs + i], information.info[classifier_name]["continuous_outputs"][pattern_kind][:, i]) self.measures[classifier_name]["auc"] += self.measures[ classifier_name][class_name]["auc"] self.measures[classifier_name]["auc"] = np.divide( np.mean(self.measures[classifier_name]["auc"]), len(classes_names)) ######################################################################################### def std(self, classifier_name, context, *args): """ Calculate the standard deviation of the classifier passed as args, for each kind of error. Thus, there is a std for false positive error, another to false positive error, etc. """ self.measures[classifier_name]['dt_efp'] = np.std( self.measures[classifier_name]["matrix"]['efp']) self.measures[classifier_name]['dt_efn'] = np.std( self.measures[classifier_name]["matrix"]['efn']) self.measures[classifier_name]['dt_e'] = np.std( self.measures[classifier_name]["matrix"]['efp'] + self.measures[classifier_name]["matrix"]['efn']) for output_index, class_text in enumerate( context["classifiers"][classifier_name]["classes_names"]): self.measures[classifier_name][class_text]['dt_efp'] = \ np.std(self.measures[classifier_name]["matrix"]["efp"], 0)[output_index] self.measures[classifier_name][class_text]['dt_efn'] = \ np.std(self.measures[classifier_name]["matrix"]["efn"], 0)[output_index] self.measures[classifier_name][class_text]['dt_e'] = \ np.std(self.measures[classifier_name]["matrix"]["e"], 0)[output_index] ############################################# @staticmethod def __build_multiple_name(sub_list): name = "" if type(sub_list) != list: for x_tuple in sub_list: name = "+".join([x for x in x_tuple]) else: for i, name_i in enumerate(sub_list): if i == len(sub_list) - 1: name += name_i else: name += name_i + "+" return name ############################################# def correctly_classified(self, sub_list): correctly_classified = np.zeros( 
len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"])) for i in range( len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"])): for j, classifier_name in enumerate(sub_list): if (np.array(self.measures[classifier_name]["matrix"] ["oracle_outputs"][i]) == np.ones( len(self.measures[classifier_name]["matrix"] ["oracle_outputs"][i]))).all(): correctly_classified[i] += 1 return correctly_classified ############################################# def interrater_agreement_k_non_pairwise(self, context, sub_list): error = 0.0 correctly_classified = self.correctly_classified(sub_list) p = np.sum([self.measures[x]['E'] for x in self.measures if 'E' in self.measures[x]]) / \ (len(sub_list) * len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"])) for i in range(len(correctly_classified)): error += correctly_classified[i] * (len(sub_list) - correctly_classified[i]) if p == 0.0: p = np.exp(100) error /= len(self.measures[sub_list[0]]["matrix"] ["oracle_outputs"]) * (len(sub_list) - 1) * p * (1 - p) return 1 - error ############################################# def difficulty(self, context, sub_list): error = 0.0 correctly_classified = self.correctly_classified(sub_list) mean_errors = np.mean(correctly_classified) for i in range(len(correctly_classified)): error += np.power((correctly_classified[i] - (correctly_classified[i] / mean_errors)), 2) error /= (len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) * np.power(len(sub_list), 2)) return 1. 
- error ############################################# def kohavi_wolpert(self, context, sub_list): error = 0.0 correctly_classified = self.correctly_classified(sub_list) for i in range(len(correctly_classified)): error += correctly_classified[i] * (len(sub_list) - correctly_classified[i]) error /= len(sub_list) error /= (len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) * np.power(len(sub_list), 2)) return error ############################################# def entropy(self, context, sub_list): Error = 0.0 correctly_classified = self.correctly_classified(sub_list) for i in range(len(correctly_classified)): Error += (min(correctly_classified[i], len(sub_list) - correctly_classified[i]) / (len(sub_list) - np.ceil(len(sub_list) / 2.))) Error /= len(self.measures[sub_list[0]]["matrix"]["oracle_outputs"]) return Error ############################################# def diversity_non_pairwise_structure(self, context, function, classifier_list): for i, classifier_name in enumerate(classifier_list): if context["interactive"]["activate"]: sys.stdout.write( "\r{0}>".format("Completed:%f%%" % ((float(i) / len(classifier_list)) * 100))) sys.stdout.flush() # name = self.__build_multiple_name(sub_list) self.measures[classifier_name][function] = \ getattr(self, function)(context, context["classifiers"][classifier_name]["classifiers"]) ############################################# def diversity_pairwise_structure(self, context, function, classifier_list): for i, classifier_1 in enumerate(classifier_list): if context["interactive"]["activate"]: sys.stdout.write( "\r{0}>".format("Completed:%f%%" % ((float(i) / len(classifier_list)) * 100))) sys.stdout.flush() for classifier_2 in context["classifiers"].keys(): if "pairwise_diversity" in self.measures[classifier_2].keys() and function in \ self.measures[classifier_2][ "pairwise_diversity"].keys() and classifier_1 in \ self.measures[classifier_2]["pairwise_diversity"][ function].keys(): 
self.measures[classifier_1]["pairwise_diversity"][function][classifier_2] = \ self.measures[classifier_2]["pairwise_diversity"][function][classifier_1] else: self.measures[classifier_1]["pairwise_diversity"][function][classifier_2] = \ getattr(self, function)(classifier_1, classifier_2, context) vector = [ self.measures[classifier_1]["pairwise_diversity"][function][x] for x in self.measures[classifier_1]["pairwise_diversity"] [function].keys() if x != classifier_1 ] self.measures[classifier_1]["pairwise_diversity"][function][ "mean"] = np.mean(vector) self.measures[classifier_1]["pairwise_diversity"][function][ "median"] = np.median(vector) self.measures[classifier_1]["pairwise_diversity"][function][ "std"] = np.std(vector) self.measures[classifier_1]["pairwise_diversity"][function][ "variance"] = np.var(vector) ############################################# def error_correlation(self, classifier_1, classifier_2, context): return np.corrcoef(self.measures[classifier_1]["matrix"]["e"], self.measures[classifier_2]["matrix"]["e"])[0][1] ############################################# def n01(self, classifier_1, classifier_2): counter = 0 for a, b in zip( self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if np.sum(a) < len(a) and np.sum(b) == len(b): counter += 1 return counter ############################################# def n10(self, classifier_1, classifier_2): counter = 0 for a, b in zip( self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if np.sum(a) == len(a) and np.sum(b) < len(b): counter += 1 return counter ############################################# def n11(self, classifier_1, classifier_2): counter = 0 for a, b in zip( self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if (a == b).all() and np.sum(a) == len(a): counter += 1 return counter 
############################################# def n00(self, classifier_1, classifier_2): counter = 0 for a, b in zip( self.measures[classifier_1]["matrix"]["oracle_outputs"], self.measures[classifier_2]["matrix"]["oracle_outputs"]): if np.sum(b) < len(b) and np.sum(a) < len(a): counter += 1 return counter ############################################# def _n_values(self, classifier_1, classifier_2, context): #this results may be divided n11 = None n00 = None n10 = None n01 = None if context["results"]["to_file"]["diversity_study"]["exact_match"]: n11 = self.n11(classifier_1, classifier_2) n00 = self.n00(classifier_1, classifier_2) n10 = self.n10(classifier_1, classifier_2) n01 = self.n01(classifier_1, classifier_2) elif context["results"]["to_file"]["diversity_study"]["by_class"]: # TODO: change this part for i in range( len(self.measures[classifier_1]["matrix"]["oracle_outputs"] [0])): n11 = sum([ 1 if x == y and x == 1 else 0 for x, y in zip( self.measures[classifier_1]["matrix"] ["oracle_outputs"], self.measures[classifier_2] ["matrix"]["oracle_outputs"]) ]) n00 = sum([ 1 if x == y and x == 0 else 0 for x, y in zip( self.measures[classifier_1]["matrix"] ["oracle_outputs"], self.measures[classifier_2] ["matrix"]["oracle_outputs"]) ]) n01 = sum([ 1 if x != y and x == 0 and y == 1 else 0 for x, y in zip( self.measures[classifier_1]["matrix"] ["oracle_outputs"], self.measures[classifier_2] ["matrix"]["oracle_outputs"]) ]) n10 = sum([ 1 if x != y and x == 1 and y == 0 else 0 for x, y in zip( self.measures[classifier_1]["matrix"] ["oracle_outputs"], self.measures[classifier_2] ["matrix"]["oracle_outputs"]) ]) else: raise ValueError( "No option selected in diversity study: by class or by exact match" ) return {"n11": n11, "N00": n00, "N01": n01, "N10": n10} ############################################# def interrater_agreement_k(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = ((values["N11"] + 
values["N10"]) * (values["N01"] + values["N00"])) + \ ((values["N11"] + values["N01"]) * (values["N10"] + values["N00"])) numerator = 2 * ((values["N11"] * values["N00"]) - (values["N01"] * values["N10"])) return numerator / denominator ############################################# def q_statistic(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = values["N11"] * values["N00"] + values["N01"] * values[ "N10"] if not denominator: denominator = 1 return (values["N11"] * values["N00"] - values["N01"] * values["N10"]) / denominator ############################################# def coefficient_p(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = np.sqrt( (values["N11"] + values["N10"]) * (values["N01"] + values["N00"]) * (values["N11"] + values["N01"]) * (values["N10"] + values["N00"])) if not denominator: denominator = 1 return (values["N11"] * values["N00"] - values["N01"] * values["N10"]) / denominator ############################################# def disagreement(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = values["N11"] * values["N00"] + values["N01"] + values[ "N10"] if not denominator: denominator = 1 return (values["N01"] + values["N10"]) / denominator ############################################# def double_fault(self, classifier_1, classifier_2, context): values = self._n_values(classifier_1, classifier_2, context) denominator = values["N11"] + values["N10"] + values["N01"] + values[ "N00"] if not denominator: denominator = 1 return values["N00"] / denominator ################################################################ def configuration_evaluation(self, context, classifier_name, information): """ To be reconstructed into a abstraction model. Initialize the information of each classifier. 
""" #information_class.automatic_threshold_determine(context,classifier_name) pattern_kind = "validation" self.rms(classifier_name, context, information, pattern_kind) name = classifier_name[:re.search(r'[A-Za-z]+[0-9]*', classifier_name). end()] neurons = context["classifiers"][classifier_name]["configuration"][ "neurons"][0] if len(self.measures[name]["evaluation"][neurons].keys()): self.measures[name]["evaluation"][neurons]['rms'].append( self.measures[classifier_name]['rms'][pattern_kind]) self.measures[name]["evaluation"][neurons]['names'].append( classifier_name) else: self.measures[name]["evaluation"][neurons]['rms'] = [] self.measures[name]["evaluation"][neurons]['rms'].append( self.measures[classifier_name]['rms'][pattern_kind]) self.measures[name]["evaluation"][neurons]['names'] = [] self.measures[name]["evaluation"][neurons]['names'].append( classifier_name) #################################################### def best_choice(self): """ Select the best configuration of a NN classifier with the class attributes information. 
""" for name in sorted([ x for x in self.measures.keys() if "evaluation" in self.measures[x].keys() ]): self.measures[name]["selection"]["rms"] = [99999.0] self.measures[name]["selection"]["neurons"]["hidden"] = [0] self.measures[name]["selection"]["name"] = [""] for neuron in sorted(self.measures[name]["evaluation"].keys()): self.measures[name]["selection"]["neurons"][neuron][ "amount"] = 0 rms_list, names_list = (list(t) for t in zip(*sorted( zip(self.measures[name]["evaluation"][neuron]['rms'], self.measures[name]["evaluation"][neuron]['names'])))) mean_rms = np.mean( self.measures[name]["evaluation"][neuron]['rms']) if mean_rms < self.measures[name]["selection"]["rms"][0]: self.measures[name]["selection"]["rms"] = [mean_rms] self.measures[name]["selection"]["neurons"]["hidden"] = [ neuron ] self.measures[name]["selection"]["neurons"][neuron][ "amount"] = 1 self.measures[name]["selection"]["names"] = \ [self.measures[name]["evaluation"][neuron]['names'][self.measures[name]["evaluation"][neuron][ 'rms'].index(sorted( self.measures[name]["evaluation"][neuron][ 'rms'])[0])]] elif mean_rms == self.measures[name]["selection"]["rms"][0]: self.measures[name]["selection"]["rms"].append(mean_rms) self.measures[name]["selection"]["neurons"][ "hidden"].append(neuron) for i in range( len(self.measures[name]["evaluation"][neuron] ['rms'])): if rms_list[i] == rms_list[0]: self.measures[name]["selection"]["names"].append( names_list[i]) self.measures[name]["selection"]["neurons"][ neuron]["amount"] += 1 ################################################################ @staticmethod def pre_forecasting_statistic(context, classifier_name, information, pattern_kind): len_classes = len( context["classifiers"][classifier_name]["classes_names"]) len_inputs = len(context["patterns"].patterns[classifier_name] [pattern_kind][0]) - len_classes classifier_outputs = information.info[classifier_name][ "continuous_outputs"][pattern_kind] classifier_patterns = \ 
context["patterns"].patterns[classifier_name][pattern_kind][:, (len_inputs - 1, len_inputs)] len_patterns = len( context["patterns"].patterns[classifier_name][pattern_kind]) d_change_pred = np.zeros(len_patterns) d_change_true = np.zeros(len_patterns) for i, instance, outputs in zip(range(len_patterns), classifier_patterns, classifier_outputs): d_change_true[i] = instance[1] - instance[0] d_change_pred[i] = outputs[0] - instance[0] return d_change_pred, d_change_true ################################################################ def tendency_accuracy(self, classifier_name, context, information, pattern_kind): """ Calculates the number of tends hits on a regression problem. The regression tolerance is a parameter added to avoid the errors due to overflow :param classifier_name: :param context: :param information: :param pattern_kind: :return: """ array_change_pred, array_change_true = Statistics( ).pre_forecasting_statistic(context, classifier_name, information, pattern_kind) hits = np.zeros(len(array_change_pred)) for i, d_change_pred, d_change_true in zip( range(len(array_change_pred)), array_change_pred, array_change_true): if d_change_pred * d_change_true > 0.0: hits[i] = 1. elif d_change_pred * d_change_true == 0.0: hits[i] = 1. else: if np.sqrt(np.abs(d_change_pred * d_change_true) ) < context["regression_tolerance_tendency"]: hits[i] = 1. else: hits[i] = 0. self.measures[classifier_name]["tendency_accuracy"] = np.mean(hits) ######################################################################################### def mase(self, classifier_name, context, information, pattern_kind): """ Mean Absolute error. Returns the inverse of the mase with a denominator that sums 1 to the error. It is intended to give an error between 1 and 0, where the 1 is the lowest error and 0.0 the highest in order to be compatible ordering different measures in the presentations. 
:param classifier_name: :param context: :param information: :param pattern_kind: :return: """ array_change_pred, array_change_true = self.pre_forecasting_statistic( context, classifier_name, information, pattern_kind) self.measures[classifier_name]["mase"] = np.divide( np.mean(np.absolute(array_change_pred)), np.mean(np.absolute(array_change_true)))