def get_statistics_independently(arff_file):
    """Split an ARFF file into one ARFF file per statistic suffix.

    For every statistic in `stats_names`, the attributes whose names end
    with that suffix are written (together with the class column) to a new
    ARFF file next to the input file.
    """
    matrix, labels, relation, attributes = am.arff_to_nparray(arff_file)
    classes = list(set(labels))
    labels = labels.reshape(-1, 1)
    folder, name = os.path.split(arff_file)
    if folder == "":
        folder = os.getcwd()
    stats_names = ['max', 'min', 'mean', 'median', 'std', 'var', 'kurt',
                   'skew', 'percentile25', 'percentile50', 'percentile75']
    for stat in stats_names:
        indices = []
        subname = name.replace(".arff", "_%s" % stat)
        for attribute in attributes:
            if attribute.endswith(stat):
                indices.append(attributes.index(attribute))
        submatrix = np.concatenate((matrix[:, indices], labels), axis=-1)
        subheader = np.concatenate(
            (np.array(attributes)[indices], np.array(["Class"])),
            axis=-1).reshape(1, -1)
        am.create_arff(
            np.concatenate((subheader, submatrix), axis=0).tolist(),
            classes, folder, subname, subname)
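
# Usage sketch (illustrative, not from the original module; the file name is
# hypothetical): given "audio_features.arff" whose attribute names end in
# suffixes such as "_mean" or "_std", the call below would write
# audio_features_mean.arff, audio_features_std.arff, etc. alongside it:
#
#     get_statistics_independently("audio_features.arff")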
def mutual_information_evaluation(arffInput):
    samples, classLabels, relation, attributeNames = am.arff_to_nparray(
        arffInput)
    resultMatrix = np.array(["Attribute"] + attributeNames).transpose()
    scores = np.concatenate(
        (np.array(["Mutual Information"]),
         mutual_info_classif(samples, classLabels))).transpose()
    resultMatrix = np.column_stack((resultMatrix, scores))
    return resultMatrix
def f_evaluation(arffInput):
    samples, classLabels, relation, attributeNames = am.arff_to_nparray(
        arffInput)
    resultMatrix = np.array(["Attribute"] + attributeNames).transpose()
    scores = np.concatenate(
        (np.array(["ANOVA F-value"]),
         f_classif(samples, classLabels)[0])).transpose()
    resultMatrix = np.column_stack((resultMatrix, scores))
    return resultMatrix
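
# Usage sketch for the two filter scorers above (file name hypothetical).
# Both return a two-column string matrix (attribute name, score), so the
# result can be dumped straight to CSV:
#
#     mi_table = mutual_information_evaluation("features.arff")
#     f_table = f_evaluation("features.arff")
#     np.savetxt("mi_scores.csv", mi_table, fmt="%s", delimiter=",")
#     np.savetxt("f_scores.csv", f_table, fmt="%s", delimiter=",")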
def evaluate_single_features(classifier, arffInput, folds=None):
    """Cross-validate `classifier` on each attribute of the ARFF file
    separately and return a table of per-attribute accuracy and AUC."""
    if folds is None:
        folds = 10
    matrix, Y, relation, attributes = am.arff_to_nparray(arffInput)
    if folds == len(Y):
        from sklearn.model_selection import LeaveOneOut
        folds = LeaveOneOut().split(matrix)
    classes = list(set(Y))
    classes.sort()
    resultMatrix = np.array([["", "Attributes"] + attributes]).transpose()
    scores = np.array([[str(classifier).split("(")[0], ""],
                       ["Accuracy", "AUC"]] +
                      [["0", "0"] for i in range(len(attributes))])
    for i in range(len(attributes)):
        X = matrix[:, i].reshape(-1, 1)
        accuracy = cross_val_score(classifier, X, Y, cv=folds,
                                   scoring="accuracy",
                                   n_jobs=multiprocessing.cpu_count())
        accuracy = round(accuracy.mean() * 100, 1)
        # Leave-one-out splits are a one-shot generator, so rebuild them
        # before they are consumed again.
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(matrix)
        try:
            auc = cross_val_score(
                classifier, X, preprocessing.label_binarize(Y, classes=classes),
                cv=folds, scoring="roc_auc",
                n_jobs=multiprocessing.cpu_count())
            auc = round(auc.mean(), 3)
        except Exception:
            print("AUC cannot be calculated")
            auc = 0
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(matrix)
        scores[i + 2] = np.array([str(accuracy), str(auc)])
    resultMatrix = np.column_stack((resultMatrix, scores))
    return resultMatrix
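
# Usage sketch (classifier and file name are assumptions): score each
# attribute on its own with 10-fold cross-validation. Passing
# folds == number of instances switches to leave-one-out, in which case
# per-fold AUC cannot be computed and the function falls back to 0:
#
#     from sklearn.svm import SVC
#     table = evaluate_single_features(SVC(kernel="linear"),
#                                      "features.arff", folds=10)
#     np.savetxt("single_feature_eval.csv", table, fmt="%s", delimiter=",")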
def stacking_proba_evaluation(classifier, databasesFolder=None,
                              modalityFiles=None, folds=None,
                              relationName=None):
    """Stacked generalization over several ARFF "views" (modalities).

    A base classifier is trained per modality; its class probabilities,
    folded into one signed score per instance, become the features of a
    meta-classifier (a deep copy of the same estimator).
    """
    if databasesFolder is None:
        databasesFolder = "datasets"
    if folds is None:
        folds = 10
    if relationName is None:
        relationName = "stacking_proba"
    if modalityFiles is None:
        modalityFiles = sorted(
            [os.path.join(databasesFolder, f)
             for f in os.listdir(databasesFolder)
             if os.path.isfile(os.path.join(databasesFolder, f))
             and not f.startswith('.') and f[-5:].lower() == ".arff"],
            key=lambda f: f.lower())
    else:
        modalityFiles = [os.path.join(databasesFolder, f)
                         for f in modalityFiles]
    try:
        with open(os.path.join(databasesFolder,
                               "list_of_instances.csv")) as listOfInstances:
            instanceNames = listOfInstances.readlines()
    except Exception:
        print("There was an error reading the list of evaluated instances.")
        raise
    print("\nMethod: " + relationName)
    instanceNames = [name.strip() for name in instanceNames]
    instanceNames += ["Accuracy", "AUC"]
    resultMatrix = np.array([[str(classifier).split("(")[0], ""]
                             + instanceNames]).transpose()
    # Splits are computed once on the first modality and reused for every
    # view; all ARFF files must list their instances in the same order.
    matrix, labels, relation, attributes = am.arff_to_nparray(modalityFiles[0])
    folds = check_cv(folds, labels, classifier=True)
    stacker = deepcopy(classifier)
    final_labels = ["None" for i in range(len(labels))]
    accuracy = []
    auc = []
    for pair in folds.split(matrix, labels):
        predictionLists = []
        test_predictions = []
        views = []
        for arffFile in modalityFiles:
            matrix, labels, relation, attributes = am.arff_to_nparray(arffFile)
            views.append(relation)
            classes = list(set(labels))
            classes.sort()
            prediction = []
            test_prediction = []
            classifier.fit(matrix[pair[0]], labels[pair[0]])
            try:
                probabilities = classifier.predict_proba(matrix[pair[0]])
                test_probability = classifier.predict_proba(matrix[pair[1]])
            except Exception:
                # Estimators without predict_proba fall back to hard
                # predictions encoded as degenerate (0/1) probabilities.
                probabilities = np.array(
                    [[0, 0] for i in range(len(labels[pair[0]]))])
                binary = preprocessing.label_binarize(
                    classifier.predict(matrix[pair[0]]),
                    classes=list(reversed(classes)))
                for i in range(len(binary)):
                    if binary[i, 0] == 0:
                        couple = [0, 1]
                    else:
                        couple = [1, 0]
                    probabilities[i] = np.array(couple)
                test_probability = np.array(
                    [[0, 0] for i in range(len(labels[pair[1]]))])
                binary = preprocessing.label_binarize(
                    classifier.predict(matrix[pair[1]]),
                    classes=list(reversed(classes)))
                for i in range(len(binary)):
                    if binary[i, 0] == 0:
                        couple = [0, 1]
                    else:
                        couple = [1, 0]
                    test_probability[i] = np.array(couple)
            # Collapse the two class probabilities into a single signed
            # score: positive for the first class, negative for the second.
            for couple in probabilities:
                labelIndex = couple.tolist().index(max(couple))
                if labelIndex == 0:
                    prediction.append(float(couple[labelIndex]))
                elif labelIndex == 1:
                    prediction.append(float(-couple[labelIndex]))
            for couple in test_probability:
                labelIndex = couple.tolist().index(max(couple))
                if labelIndex == 0:
                    test_prediction.append(float(couple[labelIndex]))
                elif labelIndex == 1:
                    test_prediction.append(float(-couple[labelIndex]))
            prediction = np.array(prediction).reshape(-1, 1)
            predictionLists.append(prediction)
            test_prediction = np.array(test_prediction).reshape(-1, 1)
            test_predictions.append(test_prediction)
        # One meta-feature column per modality.
        newMatrix = np.column_stack(tuple(predictionLists))
        new_test_matrix = np.column_stack(tuple(test_predictions))
        stacker.fit(newMatrix, labels[pair[0]])
        predictedLabels = stacker.predict(new_test_matrix)
        for idx, value in enumerate(predictedLabels):
            position = pair[1][idx]
            final_labels[position] = value
        accuracy.append(
            metrics.accuracy_score(labels[pair[1]], predictedLabels))
        try:
            auc.append(metrics.roc_auc_score(
                preprocessing.label_binarize(labels[pair[1]],
                                             classes=list(reversed(classes))),
                preprocessing.label_binarize(predictedLabels,
                                             classes=list(reversed(classes)))))
        except Exception:
            print("AUC cannot be calculated")
            auc.append(0)
    accuracy = np.array(accuracy)
    auc = np.array(auc)
    accuracy = round(accuracy.mean() * 100, 1)
    auc = round(auc.mean(), 3)
    print("Accuracy: %s\nAUC: %s" % (accuracy, auc))
    final_labels = np.array(final_labels)
    newColumn = np.array([np.concatenate(
        (np.array([relationName, "Guess"]), final_labels == labels,
         np.array([accuracy]), np.array([auc])))])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    newColumn = np.array([np.concatenate(
        (np.array(["", "Real Label"]), labels, np.array(["", ""])))])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    resultMatrix[resultMatrix == "True"] = "1"
    resultMatrix[resultMatrix == "False"] = "0"
    return resultMatrix
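
# Usage sketch with an assumed layout: "datasets/" holds one ARFF file per
# modality plus list_of_instances.csv (one instance name per line), and all
# ARFF files list the instances in the same order. The estimator is an
# illustrative choice, not the original experiment's:
#
#     from sklearn.linear_model import LogisticRegression
#     table = stacking_proba_evaluation(LogisticRegression(max_iter=1000),
#                                       databasesFolder="datasets", folds=10)
#     np.savetxt("stacking_proba.csv", table, fmt="%s", delimiter=",")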
def hard_majority_vote_evaluation(classifier, databasesFolder=None,
                                  modalityFiles=None, folds=None,
                                  relationName=None):
    """Late fusion by hard majority vote over several ARFF "views".

    One copy of `classifier` is trained per modality; each test instance
    receives the class label predicted by the most modalities.
    """
    if databasesFolder is None:
        databasesFolder = "datasets"
    if folds is None:
        folds = 10
    if relationName is None:
        relationName = "majority_vote"
    if modalityFiles is None:
        modalityFiles = sorted(
            [os.path.join(databasesFolder, f)
             for f in os.listdir(databasesFolder)
             if os.path.isfile(os.path.join(databasesFolder, f))
             and not f.startswith('.') and f[-5:].lower() == ".arff"],
            key=lambda f: f.lower())
    else:
        modalityFiles = [os.path.join(databasesFolder, f)
                         for f in modalityFiles]
    try:
        with open(os.path.join(databasesFolder,
                               "list_of_instances.csv")) as listOfInstances:
            instanceNames = listOfInstances.readlines()
    except Exception:
        print("There was an error reading the list of evaluated instances.")
        raise
    print("\nMethod: " + relationName)
    instanceNames = [name.strip() for name in instanceNames]
    instanceNames += ["Accuracy", "AUC"]
    resultMatrix = np.array([[str(classifier).split("(")[0], ""]
                             + instanceNames]).transpose()
    # Splits are computed once on the first modality and reused for every
    # view; all ARFF files must list their instances in the same order.
    matrix, labels, relation, attributes = am.arff_to_nparray(modalityFiles[0])
    folds = check_cv(folds, labels, classifier=True)
    final_labels = ["None" for i in range(len(labels))]
    accuracy = []
    auc = []
    for pair in folds.split(matrix, labels):
        predictionLists = []
        for arffFile in modalityFiles:
            matrix, labels, relation, attributes = am.arff_to_nparray(arffFile)
            classes = list(set(labels))
            classes.sort()
            classifier.fit(matrix[pair[0]], labels[pair[0]])
            predictionLists.append(classifier.predict(matrix[pair[1]]))
        predictedLabels = []
        for instance in range(len(predictionLists[0])):
            votes = [modality[instance] for modality in predictionLists]
            # The first class (in sorted order) to reach the highest vote
            # count wins, so ties favour earlier class labels.
            maxVoted = 0
            for classLabel in classes:
                classVotes = votes.count(classLabel)
                if classVotes > maxVoted:
                    maxVoted = classVotes
                    winner = classLabel
            predictedLabels.append(winner)
        for idx, value in enumerate(predictedLabels):
            position = pair[1][idx]
            final_labels[position] = value
        predictedLabels = np.array(predictedLabels)
        accuracy.append(
            metrics.accuracy_score(labels[pair[1]], predictedLabels))
        try:
            auc.append(metrics.roc_auc_score(
                preprocessing.label_binarize(labels[pair[1]],
                                             classes=list(reversed(classes))),
                preprocessing.label_binarize(predictedLabels,
                                             classes=list(reversed(classes)))))
        except Exception:
            print("AUC cannot be calculated")
            auc.append(0)
    accuracy = np.array(accuracy)
    auc = np.array(auc)
    accuracy = round(accuracy.mean() * 100, 1)
    auc = round(auc.mean(), 3)
    print("Accuracy: %s\nAUC: %s" % (accuracy, auc))
    final_labels = np.array(final_labels)
    newColumn = np.array([np.concatenate(
        (np.array([relationName, "Guess"]), final_labels == labels,
         np.array([accuracy]), np.array([auc])))])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    newColumn = np.array([np.concatenate(
        (np.array(["", "Real Label"]), labels, np.array(["", ""])))])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    resultMatrix[resultMatrix == "True"] = "1"
    resultMatrix[resultMatrix == "False"] = "0"
    return resultMatrix
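
# Usage sketch (same assumed "datasets/" layout as above; file names and
# estimator are hypothetical). An odd number of modality files keeps binary
# votes from tying:
#
#     from sklearn.tree import DecisionTreeClassifier
#     table = hard_majority_vote_evaluation(
#         DecisionTreeClassifier(),
#         modalityFiles=["audio.arff", "video.arff", "text.arff"],
#         folds=10)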
def complementarity_analysis(classifier, databasesFolder=None,
                             modalityFiles=None, exceptions=None,
                             folds=None, showProba=None):
    """Cross-validate `classifier` on each modality file separately so the
    per-instance hits and misses (and optionally class probabilities) of the
    modalities can be compared side by side."""
    if databasesFolder is None:
        databasesFolder = "datasets"
    if folds is None:
        folds = 10
    if showProba is None:
        showProba = False
    if not exceptions:
        exceptions = ["early_fusion.arff", "syntax_informed.arff"]
    if modalityFiles is None:
        modalityFiles = sorted(
            [os.path.join(databasesFolder, f)
             for f in os.listdir(databasesFolder)
             if os.path.isfile(os.path.join(databasesFolder, f))
             and not f.startswith('.') and f[-5:].lower() == ".arff"
             and f not in exceptions],
            key=lambda f: f.lower())
    else:
        modalityFiles = [os.path.join(databasesFolder, f)
                         for f in modalityFiles]
    try:
        with open(os.path.join(databasesFolder,
                               "list_of_instances.csv")) as listOfInstances:
            instanceNames = listOfInstances.readlines()
    except Exception:
        print("There was an error reading the list of evaluated instances.")
        raise
    instanceNames = [name.strip() for name in instanceNames]
    if folds == len(instanceNames):
        from sklearn.model_selection import LeaveOneOut
        indices = [i for i in range(len(instanceNames))]
        folds = LeaveOneOut().split(indices)
    instanceNames += ["Accuracy", "AUC"]
    resultMatrix = np.array([[str(classifier).split("(")[0], ""]
                             + instanceNames]).transpose()
    for arffFile in modalityFiles:
        matrix, labels, relation, attributes = am.arff_to_nparray(arffFile)
        classes = list(set(labels))
        classes.sort()
        print("\n" + str(classifier).split("(")[0])
        print("Relation: " + relation)
        predictedLabels = model.cross_val_predict(
            classifier, matrix, labels, cv=folds,
            n_jobs=multiprocessing.cpu_count())
        # Leave-one-out splits are a one-shot generator: rebuild them after
        # every consumer.
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(indices)
        if showProba:
            try:
                probabilities = model.cross_val_predict(
                    classifier, matrix, labels, method='predict_proba',
                    cv=folds, n_jobs=multiprocessing.cpu_count())
                if isinstance(folds, types.GeneratorType):
                    folds = LeaveOneOut().split(indices)
            except Exception:
                # Estimators without predict_proba fall back to hard
                # predictions encoded as degenerate (0/1) probabilities.
                probabilities = np.array([[0, 0] for i in range(len(labels))])
                binary = preprocessing.label_binarize(
                    predictedLabels, classes=list(reversed(classes)))
                for i in range(len(binary)):
                    if binary[i, 0] == 0:
                        couple = [0, 1]
                    else:
                        couple = [1, 0]
                    probabilities[i] = np.array(couple)
        print(confusion_matrix(labels, predictedLabels))
        accuracy = cross_val_score(classifier, matrix, labels, cv=folds,
                                   scoring="accuracy",
                                   n_jobs=multiprocessing.cpu_count())
        accuracy = round(accuracy.mean() * 100, 1)
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(indices)
        try:
            auc = cross_val_score(
                classifier, matrix,
                preprocessing.label_binarize(labels, classes=classes),
                cv=folds, scoring="roc_auc",
                n_jobs=multiprocessing.cpu_count())
            auc = round(auc.mean(), 3)
        except Exception:
            print("AUC cannot be calculated")
            auc = 0
        print("Accuracy: %s\nAUC: %s" % (accuracy, auc))
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(indices)
        if showProba:
            newColumn = np.concatenate(
                (np.array([["", ""],
                           [classes[0] + " probability",
                            classes[1] + " probability"]]),
                 probabilities,
                 np.array([["", ""], ["", ""]])))
            resultMatrix = np.column_stack((resultMatrix, newColumn))
        newColumn = np.array([np.concatenate(
            (np.array([relation, "Guess"]), predictedLabels == labels,
             np.array([accuracy]), np.array([auc])))])
        resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
        newColumn = np.array([np.concatenate(
            (np.array(["", "Real Label"]), labels, np.array(["", ""])))])
        resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    resultMatrix[resultMatrix == "True"] = "1"
    resultMatrix[resultMatrix == "False"] = "0"
    return resultMatrix