def Classify(clf, featuresAll, bestParam):
    """Train the classifier selected by ``clf`` and return the fitted model.

    ARGUMENTS:
        clf:          classifier key, one of 'svm', 'svm_rbf', 'extratrees',
                      'randomforest', 'knn' or 'gradientboosting'
        featuresAll:  training features, forwarded unchanged to the trainer
        bestParam:    classifier parameter, forwarded unchanged to the trainer
    RETURNS:
        whatever the selected ``aT.train*`` function returns
    RAISES:
        ValueError:   if ``clf`` is not one of the keys listed above
    """
    if clf == 'svm':
        return aT.trainSVM(featuresAll, bestParam)
    if clf == 'svm_rbf':
        return aT.trainSVM_RBF(featuresAll, bestParam)
    if clf == 'extratrees':
        return aT.trainExtraTrees(featuresAll, bestParam)
    if clf == 'randomforest':
        return aT.trainRandomForest(featuresAll, bestParam)
    if clf == 'knn':
        return aT.trainKNN(featuresAll, bestParam)
    if clf == 'gradientboosting':
        return aT.trainGradientBoosting(featuresAll, bestParam)

    # Previously an unknown key reached ``return model`` with ``model`` never
    # assigned, which surfaced as an opaque UnboundLocalError.
    raise ValueError("Classify: unsupported classifier '{0}'".format(clf))
# Example #2
# 0
 def getTrainClassifier(f_train,classifier_name,param):
     """Train and return the classifier whose model name is ``classifier_name``.

     ``classifier_name`` is compared, in order, against the model-name
     constants stored on ``AudioClassifierManager``; the first match decides
     which ``aT.train*`` function is called with ``(f_train, param)``.
     Returns None when no constant matches.

     NOTE(review): there is no ``self``/``cls`` parameter, so this is
     presumably declared as a static method of ``AudioClassifierManager``
     (the double-underscore constants would then be name-mangled) - confirm
     against the enclosing class.
     """
     if classifier_name == AudioClassifierManager.__svmModelName:
         return aT.trainSVM(f_train, param)
     if classifier_name == AudioClassifierManager.__svmRbfModelName:
         return aT.trainSVM_RBF(f_train, param)
     if classifier_name == AudioClassifierManager.__knnModelName:
         return aT.trainKNN(f_train, param)
     if classifier_name == AudioClassifierManager.__randomforestModelName:
         return aT.trainRandomForest(f_train, param)
     if classifier_name == AudioClassifierManager.__gradientboostingModelName:
         return aT.trainGradientBoosting(f_train, param)
     if classifier_name == AudioClassifierManager.__extratreesModelName:
         return aT.trainExtraTrees(f_train, param)
     # Unknown model name: signal "no classifier" to the caller.
     return None
def evaluateclassifier(features, class_names, n_exp, classifier_name, Params, parameterMode, perTrain=0.90):
    '''
    Evaluate one classifier type over a list of candidate parameter values
    using repeated random train/test splits, print a per-class
    precision / recall / f1 table, and return the best-scoring parameter.

    ARGUMENTS:
        features:     a list ([numOfClasses x 1]) whose elements contain numpy matrices of features.
                each matrix features[i] of class i is [n_samples x numOfDimensions]
        class_names:    list of class names (strings)
        n_exp:        number of cross-validation experiments (lowered to 50 when there
                are more than 1000 samples in total, and to 10 above 2000)
        classifier_name: "svm", "svm_rbf", "knn", "randomforest", "gradientboosting",
                "extratrees" or "logisticregression"
        Params:        list of classifier parameters (for parameter tuning during cross-validation)
        parameterMode:    0: choose parameters that lead to maximum overall classification ACCURACY
                1: choose parameters that lead to maximum overall f1 MEASURE
                any other value: the table is still printed and None is returned
        perTrain:    value forwarded to randSplitFeatures (presumably the training fraction)
    RETURNS:
         bestParam:    the value of the input parameter that optimizes the selected performance measure
    RAISES:
        ValueError:    if Params is empty or classifier_name is not supported
    '''

    # Reject unusable input before any work is done. Previously an empty
    # parameter list only failed at numpy.argmax, and an unknown classifier
    # name left `classifier` unbound inside the experiment loop.
    if len(Params) == 0:
        raise ValueError("evaluateclassifier: Params must contain at least one value")

    # Resolve the training function once instead of on every experiment.
    if classifier_name == "svm":
        train = trainSVM
    elif classifier_name == "svm_rbf":
        train = trainSVM_RBF
    elif classifier_name == "knn":
        train = trainKNN
    elif classifier_name == "randomforest":
        train = trainRandomForest
    elif classifier_name == "gradientboosting":
        train = trainGradientBoosting
    elif classifier_name == "extratrees":
        train = trainExtraTrees
    elif classifier_name == "logisticregression":
        train = trainLogisticRegression
    else:
        raise ValueError("evaluateclassifier: unsupported classifier "
                         "'{0}'".format(classifier_name))

    # feature normalization (mean / std are not needed here):
    features_norm, _, _ = normalizeFeatures(features)
    n_classes = len(features)
    ac_all = []
    f1_all = []
    precision_classes_all = []
    recall_classes_all = []
    f1_classes_all = []
    cms_all = []

    # compute total number of samples and cap the number of experiments:
    n_samples_total = sum(f.shape[0] for f in features)
    if n_samples_total > 1000 and n_exp > 50:
        n_exp = 50
        print("Number of training experiments changed to 50 due to high number of samples")
    if n_samples_total > 2000 and n_exp > 10:
        n_exp = 10
        print("Number of training experiments changed to 10 due to high number of samples")

    for C in Params:
        # for each param value: accumulate one confusion matrix over all experiments
        cm = numpy.zeros((n_classes, n_classes))
        for e in range(n_exp):
            # for each cross-validation iteration:
            print("Param = {0:.5f} - classifier Evaluation "
                  "Experiment {1:d} of {2:d}".format(C, e+1, n_exp))
            # split features:
            f_train, f_test = randSplitFeatures(features_norm, perTrain)
            # train the multi-class classifier on this split:
            classifier = train(f_train, C)

            # confusion matrix of this experiment (rows: true class, cols: predicted)
            cmt = numpy.zeros((n_classes, n_classes))
            for c1 in range(n_classes):
                n_test_samples = len(f_test[c1])
                res = numpy.zeros((n_test_samples, 1))
                for ss in range(n_test_samples):
                    [res[ss], _] = classifierWrapperHead(classifier,
                                                     classifier_name,
                                                     f_test[c1][ss])
                for c2 in range(n_classes):
                    cmt[c1][c2] = float(numpy.sum(res == c2))
            cm = cm + cmt

        # tiny offset keeps the divisions below defined for classes that were
        # never seen / never predicted
        cm = cm + 0.0000000010
        rec = numpy.zeros((cm.shape[0], ))
        pre = numpy.zeros((cm.shape[0], ))

        for ci in range(cm.shape[0]):
            rec[ci] = cm[ci, ci] / numpy.sum(cm[ci, :])
            pre[ci] = cm[ci, ci] / numpy.sum(cm[:, ci])
        precision_classes_all.append(pre)
        recall_classes_all.append(rec)
        f1 = 2 * rec * pre / (rec + pre)
        f1_classes_all.append(f1)
        ac_all.append(numpy.sum(numpy.diagonal(cm)) / numpy.sum(cm))

        cms_all.append(cm)
        f1_all.append(numpy.mean(f1))

    # results table: header rows
    print("\t\t", end="")
    for i, c in enumerate(class_names):
        if i == len(class_names)-1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for c in class_names:
        print("\tPRE\tREC\tf1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "f1"))

    # results table: one row per candidate parameter
    best_ac_ind = numpy.argmax(ac_all)
    best_f1_ind = numpy.argmax(f1_all)
    for i in range(len(precision_classes_all)):
        print("\t{0:.3f}".format(Params[i]), end="")
        for c in range(len(precision_classes_all[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(100.0 * precision_classes_all[i][c],
                                                       100.0 * recall_classes_all[i][c],
                                                       100.0 * f1_classes_all[i][c]), end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * ac_all[i], 100.0 * f1_all[i]), end="")
        if i == best_f1_ind:
            print("\t best f1", end="")
        if i == best_ac_ind:
            print("\t best Acc", end="")
        print("")

    if parameterMode == 0:    # keep parameters that maximize overall classification accuracy:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_ac_ind], class_names)
        return Params[best_ac_ind]
    elif parameterMode == 1:  # keep parameters that maximize overall f1 measure:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_f1_ind], class_names)
        return Params[best_f1_ind]
    # any other parameterMode: nothing is selected (kept for backward compatibility)
    return None
def featureAndTrain(list_of_dirs, mt_win, mt_step, st_win, st_step,
                    classifier_type, model_name,
                    compute_beat=False, perTrain=0.90, feats=["gfcc", "mfcc"]):
    '''
    This function is used as a wrapper to segment-based audio feature extraction and classifier training.
    ARGUMENTS:
        list_of_dirs:        list of paths of directories. Each directory contains a single audio class whose samples are stored in separate WAV files.
        mt_win, mt_step:        mid-term window length and step
        st_win, st_step:        short-term window and step
        classifier_type:        "svm" or "svm_rbf" or "knn" or "randomforest" or "gradientboosting" or "extratrees" or "logisticregression"
        model_name:        name of the model to be saved
        compute_beat:        forwarded to aF.dirsWavFeatureExtraction and stored with the model
        perTrain:        forwarded to evaluateclassifier
        feats:            forwarded (as a copy) to aF.dirsWavFeatureExtraction
    RETURNS:
        None. Resulting classifier along with the respective model parameters are saved on files:
          - "knn": a single file <model_name> holding, pickled one after another,
            X, Y, MEAN, STD, classNames, bestParam, mt_win, mt_step, st_win,
            st_step, compute_beat
          - any other type: <model_name> holds the pickled classifier and
            <model_name>MEANS holds MEAN, STD, classNames, mt_win, mt_step,
            st_win, st_step, compute_beat
    RAISES:
        ValueError:        if classifier_type is not one of the supported names
    '''

    # Candidate parameter grid. Resolved first so that an unsupported
    # classifier type is reported immediately instead of crashing with a
    # NameError after the (expensive) feature extraction has finished.
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = numpy.array([0.001, 0.01,  0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = numpy.array([10, 25, 50, 100,200,500])
    elif classifier_type == "knn":
        classifier_par = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = numpy.array([10, 25, 50, 100,200,500])
    elif classifier_type == "extratrees":
        classifier_par = numpy.array([10, 25, 50, 100,200,500])
    elif classifier_type == "logisticregression":
        classifier_par = numpy.array([0.01, 0.1, 1, 5])
    else:
        raise ValueError("featureAndTrain: unsupported classifier type "
                         "'{0}'".format(classifier_type))

    # STEP A: Feature Extraction:
    # `feats` is copied so the shared default list can never be mutated by the callee.
    [features, classNames, _] = aF.dirsWavFeatureExtraction(list_of_dirs,
                                                            mt_win,
                                                            mt_step,
                                                            st_win,
                                                            st_step,
                                                            compute_beat=compute_beat,
                                                            feats=list(feats))

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] + " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    # drop every feature vector that contains a NaN or an infinity
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i,:]
            if (not numpy.isnan(temp).any()) and (not numpy.isinf(temp).any()) :
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(numpy.array(fTemp))
    features = features2

    # get optimal classifier parameter (300 experiments requested, selection by accuracy):
    bestParam = evaluateclassifier(features, classNames, 300, classifier_type, classifier_par, 0, perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    [features_norm, MEAN, STD] = normalizeFeatures(features)        # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()

    # STEP C: Save the classifier to file
    if classifier_type == "knn":
        # kNN has no fitted model object: the normalized training set itself is stored.
        [X, Y] = listOfFeatures2Matrix(features_norm)
        knn_payload = (X.tolist(), Y.tolist(), MEAN, STD, classNames, bestParam,
                       mt_win, mt_step, st_win, st_step, compute_beat)
        # `with` guarantees the handle is closed even if pickling fails
        with open(model_name, "wb") as fo:
            for item in knn_payload:
                cPickle.dump(item, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    if classifier_type == "svm":
        classifier = trainSVM(features_norm, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(features_norm, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(features_norm, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(features_norm, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(features_norm, bestParam)
    else:  # "logisticregression" (every other name was rejected above)
        classifier = trainLogisticRegression(features_norm, bestParam)

    with open(model_name, 'wb') as fid:
        cPickle.dump(classifier, fid)
    means_payload = (MEAN, STD, classNames,
                     mt_win, mt_step, st_win, st_step, compute_beat)
    with open(model_name + "MEANS", "wb") as fo:
        for item in means_payload:
            cPickle.dump(item, fo, protocol=cPickle.HIGHEST_PROTOCOL)
def main(rootName, modelType, classifierParam, signal_type):
    """Run a 10-fold fail/success evaluation and print a summary report.

    For each i in 0..9 the directories
    ``<rootName>/fold_<i>/{train,test}/{fail,success}`` are passed to
    ``dirFeatureExtraction``; a classifier of type ``modelType`` is trained on
    the normalized training features and scored on the test features, which
    are normalized with the training MEAN / STD.

    ARGUMENTS:
        rootName:         directory that contains the fold_0 .. fold_9 folders
        modelType:        "svm", "svm_rbf", "randomforest", "gradientboosting"
                          or "extratrees"
        classifierParam:  the single classifier parameter to use; converted
                          with int() for every type except "svm" / "svm_rbf"
        signal_type:      forwarded unchanged to dirFeatureExtraction
    RETURNS:
        (CMall, Acc, Recall, Precision, F1): the confusion matrix summed over
        all folds, its accuracy, and the recall / precision / F1 values
        returned by computePreRec for it
    RAISES:
        ValueError:       if modelType is not one of the supported names
    """
    # Fail before any feature extraction; previously an unsupported type
    # (including "knn") only crashed later with `Classifier` unbound.
    if modelType not in ("svm", "svm_rbf", "randomforest",
                         "gradientboosting", "extratrees"):
        raise ValueError("main: unsupported modelType '{0}'".format(modelType))

    CMall = numpy.zeros((2, 2))
    if modelType != "svm" and modelType != "svm_rbf":
        C = [int(classifierParam)]
    else:
        C = [(classifierParam)]
    F1s = []
    Accs = []
    for ifold in range(0, 10):  # for each fold
        dirName = rootName + os.sep + "fold_{0:d}".format(
            ifold)  # get fold path name
        classNamesTrain, featuresTrain = dirFeatureExtraction([
            os.path.join(dirName, "train", "fail"),
            os.path.join(dirName, "train", "success")
        ], signal_type)  # TRAINING data feature extraction
        bestParam = aT.evaluateClassifier(
            featuresTrain, classNamesTrain, 2, modelType, C, 0,
            0.90)  # internal cross-validation (for param selection)
        classNamesTest, featuresTest = dirFeatureExtraction([
            os.path.join(dirName, "test", "fail"),
            os.path.join(dirName, "test", "success")
        ], signal_type)  # TESTING data feature extraction
        [featuresTrainNew, MEAN, STD] = aT.normalizeFeatures(
            featuresTrain)  # training features NORMALIZATION
        if modelType == "svm":  # classifier training
            Classifier = aT.trainSVM(featuresTrainNew, bestParam)
        elif modelType == "svm_rbf":
            Classifier = aT.trainSVM_RBF(featuresTrainNew, bestParam)
        elif modelType == "randomforest":
            Classifier = aT.trainRandomForest(featuresTrainNew, bestParam)
        elif modelType == "gradientboosting":
            Classifier = aT.trainGradientBoosting(featuresTrainNew, bestParam)
        elif modelType == "extratrees":
            Classifier = aT.trainExtraTrees(featuresTrainNew, bestParam)

        CM = numpy.zeros((2, 2))  # evaluation on testing data
        for iC, f in enumerate(featuresTest):  # for each class
            for i in range(
                    f.shape[0]):  # for each testing sample (feature vector)
                curF = f[i, :]  # get feature vector
                curF = (curF - MEAN) / STD  # normalize test feature vector
                winnerClass = classNamesTrain[int(
                    aT.classifierWrapper(
                        Classifier, modelType,
                        curF)[0])]  # classify and get winner class
                trueClass = classNamesTest[iC]  # get groundtruth class
                CM[classNamesTrain.index(trueClass)][classNamesTrain.index(
                    winnerClass)] += 1  # update confusion matrix
        CMall += CM  # update overall confusion matrix
        Recall, Precision, F1 = computePreRec(
            CM, classNamesTrain)  # get recall, precision and F1 (per class)
        Acc = numpy.diagonal(CM).sum() / CM.sum()  # get overall accuracy
        F1s.append(numpy.mean(F1))  # append average F1
        Accs.append(Acc)  # append classification accuracy

    # Report. Every print() takes exactly one argument, so the output is the
    # same under Python 2 and Python 3.
    print("")
    print("FINAL RESULTS")
    print("")
    print("----------------------------------")
    print("fold\tacc\tf1")
    print("----------------------------------")
    for i, (fold_acc, fold_f1) in enumerate(zip(Accs, F1s)):
        print("{0:d}\t{1:.1f}\t{2:.1f}".format(i, 100 * fold_acc,
                                               100 * fold_f1))
    Acc = numpy.diagonal(CMall).sum() / CMall.sum()
    Recall, Precision, F1 = computePreRec(CMall, classNamesTrain)
    print("----------------------------------")
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Avg", 100 * numpy.mean(Accs),
                                           100 * numpy.mean(F1s)))
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Av CM", 100 * Acc,
                                           100 * numpy.mean(F1)))
    print("----------------------------------")
    print("")
    print("Overal Confusion matrix:")
    aT.printConfusionMatrix(CMall, classNamesTrain)
    print("")
    print("FAIL Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("fail")]))
    print("FAIL Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("fail")]))
    print("SUCCESS Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("success")]))
    print("SUCCESS Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("success")]))

    return CMall, Acc, Recall, Precision, F1
def trainTextClassifiers(directoryPath, classifierType, classifierName):
    """Extract text features per class folder, tune and save a classifier.

    Every immediate sub-directory of ``directoryPath`` is treated as one
    class; each file in it is read and turned into a feature vector with
    ``getFeaturesFromText`` (dictionaries loaded from "myDicts/"). At most
    MAX_FILES_PER_CLASS randomly sampled files are used per class when that
    constant is positive.

    ARGUMENTS:
        directoryPath:   root folder holding one sub-directory per class
        classifierType:  "svm", "randomforest", "knn", "gradientboosting"
                         or "extratrees"
        classifierName:  path of the model file to write
    RETURNS:
        None. The pickled classifier is written to <classifierName> and MEAN,
        STD and the class names to <classifierName>MEANS.
        NOTE: for "knn" the parameter evaluation is run but no model is
        trained or saved (same as before this function was cleaned up).
    RAISES:
        ValueError:      if classifierType is not one of the supported names
    """
    # Candidate parameters. Resolved first so an unsupported type fails
    # immediately instead of with a NameError after the whole corpus was read.
    if classifierType == "svm":
        classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0])
    elif classifierType == "randomforest":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifierType == "knn":
        classifierParams = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifierType == "gradientboosting":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifierType == "extratrees":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    else:
        raise ValueError("trainTextClassifiers: unsupported classifier type "
                         "'{0}'".format(classifierType))

    subdirectories = get_immediate_subdirectories(directoryPath)
    dicts = loadDictionaries("myDicts/")
    classNames = []
    Features = []
    # extract features from corpus
    for si, s in enumerate(
            subdirectories):  # for each directory in training data
        files = getListOfFilesInDir(directoryPath + os.sep + s,
                                    "*")  # get list of files in directory
        if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
            files = random.sample(files, MAX_FILES_PER_CLASS)
        # One single-argument print() reproduces the line the two Python 2
        # print statements used to emit, and works on Python 2 and 3 alike.
        print("Training folder {0:d} of {1:d} ({2:s})  - {3:d} files".format(
            si + 1, len(subdirectories), s, len(files)))

        class_rows = []
        for fi in files:  # for each file in current class:
            with open(fi) as f:
                content = f.read()
            curF = getFeaturesFromText(content, dicts)  # get feature vector
            class_rows.append(curF.T)

        if not class_rows:
            # An empty class used to add a name without a feature matrix,
            # shifting every later label by one; skip it consistently instead.
            print("WARNING: no files found for class {0:s} - class skipped".format(s))
            continue
        classNames.append(s)
        # one concatenation per class instead of one per file
        Features.append(numpy.concatenate(class_rows, axis=0))

    # evaluate classifier and select best param
    nExp = 10
    bestParam = audioTrainTest.evaluateClassifier(Features, classNames,
                                                  nExp, classifierType,
                                                  classifierParams, 0, 0.9)

    # normalize features
    [featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(Features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()

    # train the final classifier
    if classifierType == "svm":
        Classifier = audioTrainTest.trainSVM(featuresNorm, bestParam)
    elif classifierType == "randomforest":
        Classifier = audioTrainTest.trainRandomForest(featuresNorm, bestParam)
    elif classifierType == "gradientboosting":
        Classifier = audioTrainTest.trainGradientBoosting(
            featuresNorm, bestParam)
    elif classifierType == "extratrees":
        Classifier = audioTrainTest.trainExtraTrees(featuresNorm, bestParam)
    else:  # "knn": nothing is trained, nothing is saved
        Classifier = None

    # save the classifier to file
    if Classifier is not None:
        with open(classifierName, 'wb') as fid:
            cPickle.dump(Classifier, fid)
        # `with` guarantees the handle is closed even if pickling fails
        with open(classifierName + "MEANS", "wb") as fo:
            cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)