# Example #1
0
def testClassifiers(dir_clf, dir_test, species, feature, clf=None, pca=False):
    '''
    Load previously trained classifiers and test on a completely new data set.

    The test set is read from <dir_test>/<species>_<feature>.tsv (tab separated,
    no header row). The last column holds the targets, all other columns are
    the features. Classifiers are loaded from
    <dir_clf>/<species>_<feature>_<name>.joblib.

    :param dir_clf: path to the saved classifiers
    :param dir_test: path to the test dataset
    :param species: species name
    :param feature: 'WEraw_all', 'WEraw_band', 'WEraw_spnodes' ...
    :param clf: classifier name e.g. 'SVM'. If None, the MLP, kNN, SVM, GP, DT,
                RF, Boost and XGB classifiers are all loaded and tested in turn.
    :param pca: if True, the features are projected with PCA before testing

    :return: None. Each model is handed to Learning.performTest
             (print out confusion matrix)
    '''
    prefix = species + '_' + feature

    # read test dataset
    d = pd.read_csv(os.path.join(dir_test, prefix + '.tsv'),
                    sep="\t",
                    header=None)
    data = d.values
    targets = data[:, -1]
    data = data[:, 0:-1]

    # use PCA if selected
    if pca:
        # NOTE(review): this fits a *new* PCA on the test data. The projection
        # (and even the number of components) can differ from whatever the
        # loaded classifiers were trained on - confirm this is intended.
        pca1 = PCA(n_components=0.8)  # keep enough components for 80% of the variance
        data = pca1.fit_transform(data)

    # Test with all nodes (every feature column).
    # To test with a subset of nodes instead (e.g. the optimum kiwi nodes
    # [35, 43, 36, 45], 1-based), pass only those columns:
    #     Learning(data[:, nodes], targets, testFraction=1)
    learners = Learning(data, targets,
                        testFraction=1)  # use all data for testing

    if clf is None:
        rule = "--------------------------------"
        # (banner label, file-name suffix) for every classifier in the standard set.
        # GMM is intentionally not part of this set.
        standard_set = (
            ("MLP", "MLP"),
            ("kNN", "kNN"),
            ("SVM", "SVM"),
            ("GP", "GP"),
            ("DT", "DT"),
            ("RF", "RF"),
            ("Boosting", "Boost"),
            ("XGB", "XGB"),
        )
        for label, suffix in standard_set:
            print(label + rule)
            # Load the model
            model = load(
                os.path.join(dir_clf, prefix + '_' + suffix + '.joblib'))
            learners.performTest(model)
        print("######################################################")
    else:
        model = load(
            os.path.join(dir_clf, prefix + '_' + clf + '.joblib'))
        learners.performTest(model)
# Example #2
0
def TrainClassifier(dir, species, feature, clf=None, pca=False):
    '''
    Use wavelet energy/MFCC as features, train, and save the classifiers for later use
    Recommended to use fit_GridSearchCV and plot validation/learning curves to determine hyper-parameter values
    and see how learning improves with more data, at what point it gets stable
    Choose what features to show to the classifier. Currently lots of variations of WE and MFCC.
    (1) Wavelet Energies - All 62 nodes, extracted from raw recordings (feature = 'weraw_all')
    (2) Wavelet Energies - Limit nodes to match frequency range of the species, extracted from raw recordings
    (3) Wavelet Energies - Limit to optimum nodes for species, extracted from raw recordings

    (4) Wavelet Energies - All 62 nodes, extracted with bandpass filter
    (5) Wavelet Energies - Limit nodes to match frequency range of the species, extracted with bandpass filter
    (6) Wavelet Energies - Limit to optimum nodes for species, extracted with bandpass filter

    (7) Wavelet Energies - All 62 nodes, extracted from denoised
    (8) Wavelet Energies - Limit nodes to match frequency range of the species, extracted from denoised
    (9) Wavelet Energies - Limit to optimum nodes for species, extracted from denoised

    (10) Wavelet Energies - All 62 nodes, extracted from denoised + bandpassed
    (11) Wavelet Energies - Limit nodes to match frequency range of the species, extracted from denoised + bandpassed
    (12) Wavelet Energies - Limit to optimum nodes for species, extracted from denoised + bandpassed

    (13) MFCC - Full range extracted from raw ('mfccraw_all')
    (14) MFCC - Limit to match frequency range of the species extracted from raw ('mfccraw_band')
    (15) MFCC - Full range extracted from bandpassed ('mfccbp_all')
    (16) MFCC - Limit to match frequency range of the species extracted from bandpassed
    (17) MFCC - Full range extracted from denoised
    (18) MFCC - Limit to match frequency range of the species extracted from denoised
    (19) MFCC - Full range extracted from bandpassed + denoised
    (20) MFCC - Limit to match frequency range of the species extracted from bandpassed + denoised

    The data set is read from <dir>/<species>_<feature>.tsv (tab separated, no
    header row). The last column holds the targets (1 = positive, 0 = negative),
    all other columns are the features. Before training, the data set is
    balanced by randomly sampling the same number of positive and negative rows.

    :param dir: path to the dataset; the trained classifiers are saved here too
    :param species: species name so that the classifier can be saved accordingly
    :param feature: 'WEraw_all', 'WEraw_band', 'WEraw_spnodes',
                    'WEbp_all', 'WEbp_band', 'WEbp_spnodes',
                    'WEd_all', 'WEd_band', 'WEd_spnodes',
                    'WEbpd_all', 'WEbpd_band', 'WEbpd_spnodes',
                    'MFCCraw_all', 'mfccraw_band',
                    'MFCCbp_all', 'mfccbp_band',
                    'MFCCd_all', 'MFCCd_band',
                    'MFCCbpd_all', 'MFCCbpd_band'
    :param clf: name of the classifier to train: 'MLP', 'kNN', 'SVM', 'GP', 'DT',
                'RF', 'Boost', 'XGB' or 'GMM'. If None, all of them except GMM
                are trained, saved and tested (expensive option). Note that the
                MLP and SVM hyper-parameters differ between the two modes, and
                that a single named classifier is saved without being tested.
    :param pca: if True, the features are projected with PCA before training
    :return: None. Each trained classifier is saved as
             <dir>/<species>_<feature>_<clf>.joblib
    :raises ValueError: if clf is neither None nor one of the names listed above
    '''
    rule = "--------------------------------"
    # Classifier name (also the file-name suffix) -> label printed in the banner.
    banner_labels = {
        'MLP': 'MLP',
        'kNN': 'kNN',
        'SVM': 'SVM',
        'GP': 'GP',
        'DT': 'DT',
        'RF': 'RF',
        'Boost': 'Boosting',
        'XGB': 'XGB',
        'GMM': 'GMM',
    }
    known_names = tuple(banner_labels)

    # Fail fast on a mistyped name instead of loading and balancing the data
    # and then silently training nothing.
    if clf is not None and clf not in known_names:
        raise ValueError("Unknown classifier {!r}; expected None or one of: {}".format(
            clf, ', '.join(known_names)))

    prefix = species + '_' + feature

    # Read previously stored data as required
    d = pd.read_csv(os.path.join(dir, prefix + '.tsv'),
                    sep="\t",
                    header=None)
    data = d.values

    # Balance the data set: keep n randomly chosen rows of each class, where n
    # is the size of the smaller class
    targets = data[:, -1]
    data = data[:, 0:-1]
    posTargetInd = np.where(targets == 1)[0].tolist()
    negTargetInd = np.where(targets == 0)[0].tolist()
    n = min(len(posTargetInd), len(negTargetInd))
    inds = random.sample(posTargetInd, n) + random.sample(negTargetInd, n)
    data = data[inds, :]
    targets = targets[inds]

    # use PCA if selected
    if pca:
        # NOTE(review): the fitted PCA is not saved with the classifiers, so the
        # same projection cannot be re-applied to new data later on.
        pca1 = PCA(n_components=0.8)  # keep enough components for 80% of the variance
        data = pca1.fit_transform(data)

    # testFraction=0.5: presumably half of the balanced data is held out for
    # testing (TODO confirm against Learning).
    # To learn with a subset of nodes instead (e.g. the optimum kiwi nodes
    # [35, 43, 36, 45], 1-based), pass only those columns:
    #     Learning(data[:, nodes], targets)
    learners = Learning(data, targets, testFraction=0.5)

    def save(model, name):
        # Save the model
        dump(model, os.path.join(dir, prefix + '_' + name + '.joblib'))

    if clf is None:  # then train all the classifiers (expensive option)
        # Lambdas keep the training lazy: nothing runs until its turn in the loop.
        train_all = (
            ('MLP', lambda: learners.trainMLP(structure=(25, ),
                                              learningrate=0.001,
                                              solver='adam',
                                              epochs=200,
                                              alpha=1,
                                              shuffle=True,
                                              early_stopping=False)),
            ('kNN', lambda: learners.trainKNN(K=3)),
            ('SVM', lambda: learners.trainSVM(kernel="rbf", C=1, gamma=0.03)),
            ('GP', lambda: learners.trainGP()),
            ('DT', lambda: learners.trainDecisionTree()),
            ('RF', lambda: learners.trainRandomForest()),
            ('Boost', lambda: learners.trainBoosting()),
            ('XGB', lambda: learners.trainXGBoost()),
            # GMM is intentionally not part of the train-everything run.
        )
        for name, train in train_all:
            print(banner_labels[name] + rule)
            model = train()
            save(model, name)
            # Every classifier is tested exactly once, after it has been saved.
            learners.performTest(model)
        print("######################################################")
        return

    # Single classifier: hyper-parameters here are deliberately kept separate
    # from the train-everything run above (MLP and SVM use different values).
    train_one = {
        'MLP': lambda: learners.trainMLP(structure=(250, ),
                                         learningrate=0.001,
                                         solver='adam',
                                         epochs=200,
                                         alpha=1,
                                         shuffle=True,
                                         early_stopping=True),
        'kNN': lambda: learners.trainKNN(K=3),
        'SVM': lambda: learners.trainSVM(kernel="rbf", C=1, gamma=0.00018),
        'GP': lambda: learners.trainGP(),
        'DT': lambda: learners.trainDecisionTree(),
        'RF': lambda: learners.trainRandomForest(),
        'Boost': lambda: learners.trainBoosting(),
        'XGB': lambda: learners.trainXGBoost(),
        'GMM': lambda: learners.trainGMM(covType='full', maxIts=200, nClasses=4),
    }
    print(banner_labels[clf] + rule)
    save(train_one[clf](), clf)