コード例 #1
0
def get_statistics_independently(arff_file):
    """Split an ARFF dataset into one ARFF file per statistic suffix.

    For every statistic name (max, min, mean, ...), collects the columns
    whose attribute name ends with that suffix and writes them — together
    with the class labels — to a new ARFF file next to the input file.

    Args:
        arff_file: path to the input .arff file.
    """
    matrix, labels, relation, attributes = am.arff_to_nparray(arff_file)
    classes = list(set(labels))
    labels = labels.reshape(-1, 1)
    folder, name = os.path.split(arff_file)
    if folder == "":
        # A bare file name means "current working directory".
        folder = os.getcwd()
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]

    for stat in stats_names:
        subname = name.replace(".arff", "_%s" % stat)
        # enumerate() avoids the O(n^2) list.index() scan of the original
        # and stays correct even if two attributes share the same name.
        indices = [
            i for i, attribute in enumerate(attributes)
            if attribute.endswith(stat)
        ]
        submatrix = np.concatenate((matrix[:, indices], labels), axis=-1)
        subheader = np.concatenate(
            (np.array(attributes)[indices], np.array(["Class"])),
            axis=-1).reshape(1, -1)
        am.create_arff(
            np.concatenate((subheader, submatrix), axis=0).tolist(), classes,
            folder, subname, subname)
コード例 #2
0
ファイル: machine_learning.py プロジェクト: aascode/DDV
def mutual_information_evaluation(arffInput):
    """Score every attribute of an ARFF dataset by mutual information.

    Builds a two-column result: attribute names in the first column and
    each attribute's mutual-information score against the class labels in
    the second.
    """
    samples, classLabels, relation, attributeNames = am.arff_to_nparray(
        arffInput)
    name_column = np.array(["Attribute"] + attributeNames).transpose()
    mi_values = mutual_info_classif(samples, classLabels)
    score_column = np.concatenate(
        (np.array(["Mutual Information"]), mi_values)).transpose()
    return np.column_stack((name_column, score_column))
コード例 #3
0
ファイル: machine_learning.py プロジェクト: aascode/DDV
def f_evaluation(arffInput):
    """Score every attribute of an ARFF dataset by ANOVA F-value.

    Returns a two-column matrix: attribute names paired with their
    per-attribute ANOVA F statistic against the class labels.
    """
    samples, classLabels, relation, attributeNames = am.arff_to_nparray(
        arffInput)
    name_column = np.array(["Attribute"] + attributeNames).transpose()
    # f_classif returns (F-values, p-values); only the F-values are kept.
    f_values = f_classif(samples, classLabels)[0]
    score_column = np.concatenate(
        (np.array(["ANOVA F-value"]), f_values)).transpose()
    return np.column_stack((name_column, score_column))
コード例 #4
0
ファイル: machine_learning.py プロジェクト: aascode/DDV
def evaluate_single_features(classifier, arffInput, folds=None):
    """Cross-validate a classifier on each single attribute in isolation.

    For every attribute of the ARFF dataset, the classifier is evaluated
    using only that column as input, recording its cross-validated
    accuracy (percentage) and ROC AUC.

    Args:
        classifier: a scikit-learn estimator.
        arffInput: path to the input .arff file.
        folds: number of CV folds (default 10); when equal to the number
            of samples, leave-one-out cross-validation is used.

    Returns:
        A 2D string array: a two-row header followed by one row per
        attribute with its name, accuracy and AUC.
    """
    if folds is None:
        folds = 10

    matrix, Y, relation, attributes = am.arff_to_nparray(arffInput)
    if folds == len(Y):
        from sklearn.model_selection import LeaveOneOut
        folds = LeaveOneOut().split(matrix)
    classes = list(set(Y))
    resultMatrix = np.array([["", "Attributes"] + attributes]).transpose()
    scores = np.array(
        [[str(classifier).split("(")[0], ""], ["Accuracy", "AUC"]] +
        [["0", "0"] for i in range(len(attributes))])
    for i in range(len(attributes)):
        # Evaluate on a single column, reshaped to (n_samples, 1).
        X = matrix[:, i].reshape(-1, 1)
        # NOTE: the original also ran cross_val_predict here, but its
        # result was never used; it wasted a full CV pass and exhausted
        # the LeaveOneOut generator before the accuracy computation.
        accuracy = cross_val_score(classifier,
                                   X,
                                   Y,
                                   cv=folds,
                                   scoring="accuracy",
                                   n_jobs=multiprocessing.cpu_count())
        # Aggregate fold scores to a percentage; the original stored the
        # repr of the raw ndarray, inconsistent with how AUC is handled.
        accuracy = round(accuracy.mean() * 100, 1)
        # A LeaveOneOut split generator is single-use: re-create it before
        # the next cross-validation call.
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(matrix)
        try:
            auc = cross_val_score(classifier,
                                  X,
                                  preprocessing.label_binarize(Y, classes),
                                  cv=folds,
                                  scoring="roc_auc",
                                  n_jobs=multiprocessing.cpu_count())
            auc = round(auc.mean(), 3)
        except Exception:
            print("AUC cannot be calculated")
            auc = 0
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(matrix)
        scores[i + 2] = np.array([str(accuracy), str(auc)])
    resultMatrix = np.column_stack((resultMatrix, scores))
    return resultMatrix
コード例 #5
0
ファイル: machine_learning.py プロジェクト: aascode/DDV
def stacking_proba_evaluation(classifier,
                              databasesFolder=None,
                              modalityFiles=None,
                              folds=None,
                              relationName=None):
    """Evaluate stacked (late) fusion of several modalities via probabilities.

    For every CV fold, *classifier* is trained per modality ARFF file; each
    modality's class probabilities are collapsed into one signed score
    (positive for the winning first class, negative for the second) and the
    per-modality scores become the features of a second-level classifier
    (a deep copy of *classifier*) that makes the final prediction.

    Args:
        classifier: scikit-learn estimator, used per modality and (copied)
            as the stacking meta-classifier.
        databasesFolder: folder with the modality .arff files and a
            "list_of_instances.csv" file (default "datasets").
        modalityFiles: optional list of ARFF file names inside
            databasesFolder; by default every non-hidden .arff file there.
        folds: CV specification accepted by sklearn's check_cv (default 10).
        relationName: label used in the output header (default
            "stacking_proba").

    Returns:
        A 2D string array: per-instance guess correctness ("1"/"0"),
        overall mean accuracy and AUC, plus the real labels.

    NOTE(review): the probability-to-signed-score mapping assumes exactly
    two classes — confirm before using on multi-class data.
    """
    if databasesFolder == None:
        databasesFolder = "datasets"
    if folds == None:
        folds = 10
    if relationName == None:
        relationName = "stacking_proba"
    if modalityFiles == None:
        # Default: every non-hidden .arff file in the folder, case-insensitively sorted.
        modalityFiles = sorted([
            os.path.join(databasesFolder, f)
            for f in os.listdir(databasesFolder)
            if os.path.isfile(os.path.join(databasesFolder, f))
            and not f.startswith('.') and f[-5:].lower() == ".arff"
        ],
                               key=lambda f: f.lower())
    else:
        modalityFiles = [
            os.path.join(databasesFolder, f) for f in modalityFiles
        ]
    try:
        with open(os.path.join(databasesFolder,
                               "list_of_instances.csv")) as listOfInstances:
            instanceNames = listOfInstances.readlines()
    except:
        print("There was an error reading the list of evaluated instances.")
        raise
    print("\nMethod: " + relationName)
    instanceNames = [name.strip() for name in instanceNames]
    instanceNames += ["Accuracy", "AUC"]

    resultMatrix = np.array([[str(classifier).split("(")[0], ""] +
                             instanceNames]).transpose()
    matrix, labels, relation, attributes = am.arff_to_nparray(modalityFiles[0])
    # Normalize the folds argument into concrete (train, test) index pairs.
    folds = check_cv(folds, labels)
    # Independent copy so fitting the meta-level never touches the base estimator.
    stacker = deepcopy(classifier)
    final_labels = ["None" for i in range(len(labels))]
    accuracy = []
    auc = []
    for pair in folds:
        # pair[0] = train indices, pair[1] = test indices.
        predictionLists = []
        test_predictions = []
        views = []
        for arffFile in modalityFiles:
            matrix, labels, relation, attributes = am.arff_to_nparray(arffFile)
            views.append(relation)
            classes = list(set(labels))
            classes.sort()
            prediction = []
            test_prediction = []
            classifier.fit(matrix[pair[0]], labels[pair[0]])
            try:
                probabilities = classifier.predict_proba(matrix[pair[0]])
                # probabilities = model.cross_val_predict(classifier, matrix, labels, method='predict_proba', cv=folds)
                test_probability = classifier.predict_proba(matrix[pair[1]])
            except:
                # Estimator has no usable predict_proba: fall back to hard
                # predictions encoded as one-hot pseudo-probabilities.
                probabilities = np.array([[0, 0]
                                          for i in range(len(labels[pair[0]]))
                                          ])
                binary = preprocessing.label_binarize(
                    classifier.predict(matrix[pair[0]]),
                    classes=list(reversed(classes)))
                for i in range(len(binary)):
                    if binary[i, 0] == 0:
                        couple = [0, 1]
                    else:
                        couple = [1, 0]
                    probabilities[i] = np.array(couple)

                test_probability = np.array(
                    [[0, 0] for i in range(len(labels[pair[1]]))])
                binary = preprocessing.label_binarize(
                    classifier.predict(matrix[pair[1]]),
                    classes=list(reversed(classes)))
                for i in range(len(binary)):
                    if binary[i, 0] == 0:
                        couple = [0, 1]
                    else:
                        couple = [1, 0]
                    test_probability[i] = np.array(couple)

            # Collapse each probability pair into one signed confidence:
            # positive if class 0 wins, negative if class 1 wins.
            for couple in probabilities:
                labelIndex = couple.tolist().index(max(couple))
                if labelIndex == 0:
                    prediction.append(float(couple[labelIndex]))
                elif labelIndex == 1:
                    prediction.append(float(-couple[labelIndex]))

            for couple in test_probability:
                labelIndex = couple.tolist().index(max(couple))
                if labelIndex == 0:
                    test_prediction.append(float(couple[labelIndex]))
                elif labelIndex == 1:
                    test_prediction.append(float(-couple[labelIndex]))

            prediction = np.array(prediction).reshape(-1, 1)
            predictionLists.append(prediction)
            test_prediction = np.array(test_prediction).reshape(-1, 1)
            test_predictions.append(test_prediction)
        # One column per modality: the meta-level feature matrices.
        newMatrix = np.column_stack(tuple(predictionLists))
        new_test_matrix = np.column_stack(tuple(test_predictions))
        stacker.fit(newMatrix, labels[pair[0]])
        predictedLabels = stacker.predict(new_test_matrix)
        for idx, value in enumerate(predictedLabels):
            position = pair[1][idx]
            final_labels[position] = value
        accuracy.append(
            metrics.accuracy_score(labels[pair[1]], predictedLabels))
        try:
            auc.append(
                metrics.roc_auc_score(
                    preprocessing.label_binarize(labels[pair[1]],
                                                 classes=list(
                                                     reversed(classes))),
                    preprocessing.label_binarize(predictedLabels,
                                                 classes=list(
                                                     reversed(classes)))))
        except:
            print("AUC cannot be calculated")
            auc.append(0)
    # predictedLabels = model.cross_val_predict(classifier, newMatrix, labels, cv=folds)
    # accuracy = round(metrics.accuracy_score(labels, predictedLabels)*100, 1)
    #print metrics.accuracy_score(labels, model.cross_val_predict(classifier, matrix, labels, cv=10))
    # auc = metrics.roc_auc_score(preprocessing.label_binarize(labels, classes=list(reversed(classes))),
    #                             preprocessing.label_binarize(predictedLabels, classes=list(reversed(classes))))
    # auc = round(auc, 3)
    # Aggregate the per-fold scores into the reported summary numbers.
    accuracy = np.array(accuracy)
    auc = np.array(auc)
    accuracy = round(accuracy.mean() * 100, 1)
    auc = round(auc.mean(), 3)
    print("Accuracy: %s\nAUC: %s" % (accuracy, auc))
    #print metrics.roc_auc_score(preprocessing.label_binarize(labels, classes=list(reversed(classes))),
    #                            preprocessing.label_binarize(model.cross_val_predict(classifier, matrix, labels, cv=10),
    #                                                         classes=list(reversed(classes))))
    final_labels = np.array(final_labels)
    # Column of per-instance correctness plus the summary rows.
    newColumn = np.array([
        np.concatenate((np.array([relationName,
                                  "Guess"]), final_labels == labels,
                        np.array([accuracy]), np.array([auc])))
    ])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    newColumn = np.array([
        np.concatenate((np.array(["",
                                  "Real Label"]), labels, np.array(["", ""])))
    ])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    # Normalize the boolean reprs produced by final_labels == labels.
    resultMatrix[resultMatrix == "True"] = "1"
    resultMatrix[resultMatrix == "False"] = "0"
    return resultMatrix
コード例 #6
0
ファイル: machine_learning.py プロジェクト: aascode/DDV
def hard_majority_vote_evaluation(classifier,
                                  databasesFolder=None,
                                  modalityFiles=None,
                                  folds=None,
                                  relationName=None):
    """Evaluate hard majority-vote fusion over several modality ARFF files.

    For each CV fold, *classifier* is trained per modality and the test
    instances receive the label most voted across modalities.

    Args:
        classifier: scikit-learn estimator trained once per modality.
        databasesFolder: folder with the modality .arff files and a
            "list_of_instances.csv" file (default "datasets").
        modalityFiles: optional list of ARFF file names inside
            databasesFolder; by default every non-hidden .arff file there.
        folds: CV specification accepted by sklearn's check_cv (default 10).
        relationName: label used in the output header (default
            "majority_vote").

    Returns:
        A 2D string array: per-instance guess correctness ("1"/"0"),
        overall mean accuracy and AUC, plus the real labels.

    NOTE(review): vote ties are broken by class sort order (first class
    reaching the max count wins) — confirm this is intended.
    """
    if databasesFolder == None:
        databasesFolder = "datasets"
    if folds == None:
        folds = 10
    if relationName == None:
        relationName = "majority_vote"
    if modalityFiles == None:
        # Default: every non-hidden .arff file in the folder, case-insensitively sorted.
        modalityFiles = sorted([
            os.path.join(databasesFolder, f)
            for f in os.listdir(databasesFolder)
            if os.path.isfile(os.path.join(databasesFolder, f))
            and not f.startswith('.') and f[-5:].lower() == ".arff"
        ],
                               key=lambda f: f.lower())
    else:
        modalityFiles = [
            os.path.join(databasesFolder, f) for f in modalityFiles
        ]
    try:
        with open(os.path.join(databasesFolder,
                               "list_of_instances.csv")) as listOfInstances:
            instanceNames = listOfInstances.readlines()
    except:
        print("There was an error reading the list of evaluated instances.")
        raise
    print("\nMethod: " + relationName)
    instanceNames = [name.strip() for name in instanceNames]
    instanceNames += ["Accuracy", "AUC"]

    resultMatrix = np.array([[str(classifier).split("(")[0], ""] +
                             instanceNames]).transpose()
    matrix, labels, relation, attributes = am.arff_to_nparray(modalityFiles[0])
    # Normalize the folds argument into concrete (train, test) index pairs.
    folds = check_cv(folds, labels)
    final_labels = ["None" for i in range(len(labels))]
    accuracy = []
    auc = []
    for pair in folds:
        # pair[0] = train indices, pair[1] = test indices.
        predictionLists = []
        for arffFile in modalityFiles:
            matrix, labels, relation, attributes = am.arff_to_nparray(arffFile)
            classes = list(set(labels))
            classes.sort()
            classifier.fit(matrix[pair[0]], labels[pair[0]])
            predictionLists.append(classifier.predict(matrix[pair[1]]))
            # predictionLists.append(model.cross_val_predict(classifier, matrix, labels, cv=folds))
        predictedLabels = []
        for instance in range(len(predictionLists[0])):
            # One vote per modality for this test instance.
            votes = [modality[instance] for modality in predictionLists]
            maxVoted = 0
            for classLabel in classes:
                classVotes = votes.count(classLabel)
                if classVotes > maxVoted:
                    maxVoted = classVotes
                    winner = classLabel
            predictedLabels.append(winner)
        for idx, value in enumerate(predictedLabels):
            position = pair[1][idx]
            final_labels[position] = value
        predictedLabels = np.array(predictedLabels)
        accuracy.append(
            metrics.accuracy_score(labels[pair[1]], predictedLabels))
        try:
            auc.append(
                metrics.roc_auc_score(
                    preprocessing.label_binarize(labels[pair[1]],
                                                 classes=list(
                                                     reversed(classes))),
                    preprocessing.label_binarize(predictedLabels,
                                                 classes=list(
                                                     reversed(classes)))))
        except:
            print("AUC cannot be calculated")
            auc.append(0)

    # accuracy = round(metrics.accuracy_score(labels, predictedLabels)*100, 1)
    # print metrics.accuracy_score(labels, model.cross_val_predict(classifier, matrix, labels, cv=10))
    # auc = metrics.roc_auc_score(preprocessing.label_binarize(labels, classes=list(reversed(classes))),
    #                             preprocessing.label_binarize(predictedLabels, classes=list(reversed(classes))))
    # auc = round(auc, 3)
    # Aggregate the per-fold scores into the reported summary numbers.
    accuracy = np.array(accuracy)
    auc = np.array(auc)
    accuracy = round(accuracy.mean() * 100, 1)
    auc = round(auc.mean(), 3)
    print("Accuracy: %s\nAUC: %s" % (accuracy, auc))
    #print metrics.roc_auc_score(preprocessing.label_binarize(labels, classes=list(reversed(classes))),
    #                            preprocessing.label_binarize(model.cross_val_predict(classifier, matrix, labels, cv=10),
    #                                                         classes=list(reversed(classes))))
    final_labels = np.array(final_labels)
    # Column of per-instance correctness plus the summary rows.
    newColumn = np.array([
        np.concatenate((np.array([relationName,
                                  "Guess"]), final_labels == labels,
                        np.array([accuracy]), np.array([auc])))
    ])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    newColumn = np.array([
        np.concatenate((np.array(["",
                                  "Real Label"]), labels, np.array(["", ""])))
    ])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    # Normalize the boolean reprs produced by final_labels == labels.
    resultMatrix[resultMatrix == "True"] = "1"
    resultMatrix[resultMatrix == "False"] = "0"
    return resultMatrix
コード例 #7
0
ファイル: machine_learning.py プロジェクト: aascode/DDV
def complementarity_analysis(classifier,
                             databasesFolder=None,
                             modalityFiles=None,
                             exceptions=None,
                             folds=None,
                             showProba=None):
    """Cross-validate a classifier on each modality ARFF file separately.

    Evaluates *classifier* independently on every modality dataset and
    assembles a matrix of per-instance guesses (and optionally class
    probabilities) so the complementarity of the modalities can be
    inspected side by side.

    Args:
        classifier: a scikit-learn estimator.
        databasesFolder: folder with the modality .arff files and a
            "list_of_instances.csv" file (default "datasets").
        modalityFiles: optional list of ARFF file names inside
            databasesFolder; by default every non-hidden .arff file there
            that is not listed in *exceptions*.
        exceptions: file names to skip when auto-discovering modality
            files (default: early/syntax fusion files). The original
            signature used a mutable default list; None is equivalent.
        folds: number of CV folds (default 10); when equal to the number
            of instances, leave-one-out cross-validation is used.
        showProba: also emit per-class probability columns (default False).

    Returns:
        A 2D string array: one column pair per modality (guess
        correctness "1"/"0" plus accuracy/AUC summary rows) and a final
        real-label column.

    NOTE(review): the probability columns assume exactly two classes —
    confirm for multi-class data.
    """
    if databasesFolder is None:
        databasesFolder = "datasets"
    if folds is None:
        folds = 10
    if showProba is None:
        showProba = False
    if not exceptions:
        exceptions = ["early_fusion.arff", "syntax_informed.arff"]
    if modalityFiles is None:
        # Default: every non-hidden, non-excepted .arff file, case-insensitively sorted.
        modalityFiles = sorted([
            os.path.join(databasesFolder, f)
            for f in os.listdir(databasesFolder)
            if os.path.isfile(os.path.join(databasesFolder, f))
            and not f.startswith('.') and f[-5:].lower() == ".arff"
            and not f in exceptions
        ],
                               key=lambda f: f.lower())
    else:
        modalityFiles = [
            os.path.join(databasesFolder, f) for f in modalityFiles
        ]
    try:
        with open(os.path.join(databasesFolder,
                               "list_of_instances.csv")) as listOfInstances:
            instanceNames = listOfInstances.readlines()
    except Exception:
        print("There was an error reading the list of evaluated instances.")
        raise
    instanceNames = [name.strip() for name in instanceNames]
    if folds == len(instanceNames):
        from sklearn.model_selection import LeaveOneOut
        indices = [i for i in range(len(instanceNames))]
        folds = LeaveOneOut().split(indices)
    instanceNames += ["Accuracy", "AUC"]

    resultMatrix = np.array([[str(classifier).split("(")[0], ""] +
                             instanceNames]).transpose()
    for arffFile in modalityFiles:
        matrix, labels, relation, attributes = am.arff_to_nparray(arffFile)
        classes = list(set(labels))
        classes.sort()
        print("\n" + str(classifier).split("(")[0])
        print("Relation: " + relation)
        predictedLabels = model.cross_val_predict(
            classifier,
            matrix,
            labels,
            cv=folds,
            n_jobs=multiprocessing.cpu_count())
        # A LeaveOneOut split generator is single-use: re-create it before
        # every subsequent cross-validation call.
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(indices)
        if showProba:
            try:
                probabilities = model.cross_val_predict(
                    classifier,
                    matrix,
                    labels,
                    method='predict_proba',
                    cv=folds,
                    n_jobs=multiprocessing.cpu_count())
                if isinstance(folds, types.GeneratorType):
                    folds = LeaveOneOut().split(indices)
            except Exception:
                # No predict_proba available: encode the hard predictions
                # as one-hot pseudo-probabilities instead.
                probabilities = np.array([[0, 0] for i in range(len(labels))])
                binary = preprocessing.label_binarize(predictedLabels,
                                                      classes=list(
                                                          reversed(classes)))
                for i in range(len(binary)):
                    # Renamed from `tuple`, which shadowed the builtin.
                    if binary[i, 0] == 0:
                        couple = [0, 1]
                    else:
                        couple = [1, 0]
                    probabilities[i] = np.array(couple)

        print(confusion_matrix(labels, predictedLabels))
        accuracy = cross_val_score(classifier,
                                   matrix,
                                   labels,
                                   cv=folds,
                                   scoring="accuracy",
                                   n_jobs=multiprocessing.cpu_count())
        accuracy = round(accuracy.mean() * 100, 1)
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(indices)
        try:
            auc = cross_val_score(classifier,
                                  matrix,
                                  preprocessing.label_binarize(
                                      labels, classes),
                                  cv=folds,
                                  scoring="roc_auc",
                                  n_jobs=multiprocessing.cpu_count())
            auc = round(auc.mean(), 3)
        except Exception:
            print("AUC cannot be calculated")
            auc = 0
        print("Accuracy: %s\nAUC: %s" % (accuracy, auc))
        if isinstance(folds, types.GeneratorType):
            folds = LeaveOneOut().split(indices)
        if showProba:
            newColumn = np.concatenate((np.array([
                ["", ""],
                [classes[0] + " probability", classes[1] + " probability"]
            ]), probabilities, np.array([["", ""], ["", ""]])))
            resultMatrix = np.column_stack((resultMatrix, newColumn))
        # Per-modality column: per-instance correctness plus summary rows.
        newColumn = np.array([
            np.concatenate((np.array([relation,
                                      "Guess"]), predictedLabels == labels,
                            np.array([accuracy]), np.array([auc])))
        ])
        resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    newColumn = np.array([
        np.concatenate((np.array(["",
                                  "Real Label"]), labels, np.array(["", ""])))
    ])
    resultMatrix = np.column_stack((resultMatrix, newColumn.transpose()))
    # Normalize the boolean reprs produced by predictedLabels == labels.
    resultMatrix[resultMatrix == "True"] = "1"
    resultMatrix[resultMatrix == "False"] = "0"
    return resultMatrix