Example #1
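# --- Imports (not shown in the original example; reconstructed as a best guess) ---------
# The standard-library, scikit-learn and xgboost imports below are known APIs;
# StratifiedGroupKFold requires scikit-learn >= 1.0. The Measurement import assumes the
# ecosound package layout (ecosound.core.measurement). The helper functions used in
# main() (add_class_ID, add_subclass, add_group, plot_datasets_distrib,
# plot_dataset_distrib, plot_datasets_groups, cross_validation, summarize_performance,
# plot_PR_curves, plot_F_curves, classification_train, classification_predict) are
# assumed to come from a local module that is not part of this snippet.
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from ecosound.core.measurement import Measurement
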
def main():
    # input arguments
    input_args = dict()
    input_args['positive_class_label'] = 'FS'
    input_args['train_ratio'] = 0.75
    input_args['cv_splits'] = 10  # was 5
    input_args['cv_repeats'] = 1
    input_args['rebalance_classes'] = True
    # input_args['data_file'] = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Detector\results\dataset_FS-NN_modified_20201105145300.nc'
    input_args['data_file'] = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Detector\results\dataset_FS-NN_modified_20200902194334.nc'
    input_args['out_dir'] = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Detector\results\Classification'
    input_args['run_CV'] = False
    input_args['train_final_model'] = True
    input_args['final_model_name'] = 'RF50'

    ## DEFINITION OF CLASSIFIERS -------------------------------------------------
    models = []
    models.append(('Dummy', DummyClassifier(strategy="constant", constant=1)))
    models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    #models.append(('KNN', KNeighborsClassifier()))
    #models.append(('KNN', KNeighborsClassifier(n_neighbors=4, metric='euclidean')))
    models.append(('CART', DecisionTreeClassifier()))
    #models.append(('NB', GaussianNB()))
    models.append(('XGBoost', XGBClassifier()))
    #models.append(('MLP', MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=0)))
    models.append(('RF5',
                   RandomForestClassifier(n_estimators=5,
                                          min_samples_split=100,
                                          min_samples_leaf=50,
                                          random_state=0)))
    models.append(('RF10',
                   RandomForestClassifier(n_estimators=10,
                                          min_samples_split=100,
                                          min_samples_leaf=50,
                                          random_state=0)))
    models.append(('RF30',
                   RandomForestClassifier(n_estimators=30,
                                          min_samples_split=100,
                                          min_samples_leaf=50,
                                          random_state=0)))
    models.append(('RF50',
                   RandomForestClassifier(n_estimators=50,
                                          min_samples_split=100,
                                          min_samples_leaf=50,
                                          random_state=0)))
    #models.append(('RF100', RandomForestClassifier(n_estimators=100,min_samples_split= 100, min_samples_leaf=50,random_state=0)))

    ## setup output folder
    now = datetime.now()
    now_str = now.strftime("%Y%m%dT%H%M%S")
    out_dir = os.path.join(input_args['out_dir'], now_str)
    os.mkdir(out_dir)

    ## Save input args to txt file
    with open(os.path.join(out_dir, 'input_args_' + now_str + '.txt'), 'w') as text_file:
        text_file.write(str(input_args))

    ## Check that the requested final model name exists before running all the processing
    if input_args['train_final_model']:
        # list.index() raises a ValueError right away if the name is not in 'models'
        model_idx = [name for name, _ in models].index(input_args['final_model_name'])

    ## LOAD DATASET --------------------------------------------------------------
    dataset = Measurement()
    dataset.from_netcdf(input_args['data_file'])
    print(dataset.summary())

    ## DATA PREPARATION ----------------------------------------------------------
    # list of features used for the classification
    features = dataset.metadata['measurements_name'][0]
    # data
    data = dataset.data
    # drop FS observations at Mill Bay
    indexNames = data[(data['label_class'] == 'FS')
                      & (data['location_name'] == 'Mill bay')].index
    data.drop(indexNames, inplace=True)
    # add subclass + IDs
    data, class_encoder = add_class_ID(data,
                                       input_args['positive_class_label'])
    data, _ = add_subclass(data)
    #subclass2class_table = subclass2class_conversion(data)
    # add group ID
    data, group_encoder = add_group(data)

    ## DATA CLEAN-UP -------------------------------------------------------------
    # Basic stats on all features
    data_stats = data[features].describe()
    #print(data_stats)

    # how many NaNs and Infs per column
    data = data.replace([np.inf, -np.inf], np.nan)
    Nnan = data[features].isna().sum()
    ax = Nnan.plot(kind='bar', title='Number of NaN/Inf', grid=True)
    ax.set_ylabel('Number of observations with NaNs/Infs')

    # Drop features with too many NaNs, plus 'uuid' which is an identifier rather than a feature
    features.remove('freq_flatness')
    features.remove('snr')
    features.remove('uuid')

    # drop observations/rows with NaNs in any of the remaining features
    data.dropna(subset=features, axis=0, how='any', inplace=True)
    data_stats2 = data[features].describe()

    # ## VISUALIZATION -------------------------------------------------------------
    # # box and whisker plots
    # data[features].plot(kind='box', subplots=True, layout=(7,7), sharex=False, sharey=False)
    # # histograms
    # data[features].hist()
    # # scatter plot matrix
    # pd.plotting.scatter_matrix(data[features])
    # scatter plot PCA
    # pca = PCA(n_components=2)
    # X  = pca.fit_transform(data[features])
    # y = data['class_ID']
    # plot_2d_space(X, y, 'Imbalanced dataset (2 PCA components)')

    ## SPLIT DATA INTO TRAIN & TEST SETS ------------------------------------------
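    # With train_ratio = 0.75 this gives round(1 / 0.25) = 4 folds, so one fold (~25% of
    # the groups) is held out below as the test set.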
    n_splits = round(1 / (1 - input_args['train_ratio']))
    skf = StratifiedGroupKFold(n_splits=n_splits,
                               shuffle=True,
                               random_state=None)
    # keep only the first split as the train/test partition
    for train_index, test_index in skf.split(data,
                                             data['subclass_ID'],
                                             groups=data['group_ID']):
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        break
    # plot class repartition
    plot_datasets_distrib(data_train, data_test)
    plot_dataset_distrib(data,
                         attr_list=['subclass_label', 'label_class'],
                         title='Full dataset')
    plot_dataset_distrib(data_train,
                         attr_list=['subclass_label', 'label_class'],
                         title='Training set')
    plot_dataset_distrib(data_test,
                         attr_list=['subclass_label', 'label_class'],
                         title='Test set')
    # verify groups are not used in both datasets
    groups_intersection = plot_datasets_groups(data_train,
                                               data_test,
                                               show=True)

    ## CROSS VALIDATION ON TRAIN SET ----------------------------------------------
    if input_args['run_CV']:
        # run train/test experiments
        cv_predictions, cv_performance = cross_validation(
            data_train,
            models,
            features,
            cv_splits=input_args['cv_splits'],
            cv_repeats=input_args['cv_repeats'],
            rebalance=input_args['rebalance_classes'])
        # display summary results
        performance_report = summarize_performance(cv_performance,
                                                   threshold=0.5)
        print(performance_report)
        # plot mean Precision and Recall curves
        plot_PR_curves(cv_performance)
        plot_F_curves(cv_performance)
        # save results
        CV_results = {
            'cv_predictions': cv_predictions,
            'cv_performance': cv_performance,
            'models': models,
            'input_args': input_args,
        }
        with open(os.path.join(out_dir, 'CV_' + now_str + '.sav'), 'wb') as f:
            pickle.dump(CV_results, f)

    ## FINAL EVALUATION ON TEST SET -----------------------------------------------
    if input_args['train_final_model']:

        print('\nFinal evaluation on test set:\n')

        model_name = models[model_idx][0]
        model = models[model_idx][1]  # classifier selected via input_args['final_model_name']
        print(model)
        X_train = data_train[features]    # features
        Y_train = data_train['class_ID']  # labels
        X_test = data_test[features]      # features
        Y_test = data_test['class_ID']    # labels
        # feature normalization
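        # z-score the features using statistics computed on the training set only, so no
        # information from the test set leaks into the model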
        Norm_mean = X_train.mean()
        Norm_std = X_train.std()
        X_train = (X_train - Norm_mean) / Norm_std
        X_test = (X_test - Norm_mean) / Norm_std
        # Train on entire train set
        final_model = classification_train(
            X_train, Y_train, model, rebalance=input_args['rebalance_classes'])
        # Evaluate on full test set
        pred_class, pred_prob = classification_predict(X_test, final_model)
        # Print evaluation report
        CR = classification_report(Y_test, pred_class)
        print(CR)
        # save the model and its preprocessing parameters to disk
        model_package = {
            'name': model_name,
            'model': final_model,
            'features': features,
            'normalization_mean': Norm_mean,
            'normalization_std': Norm_std,
            'classes': class_encoder,
            'input_args': input_args,
        }
        with open(os.path.join(out_dir, model_name + '_model_' + now_str + '.sav'), 'wb') as f:
            pickle.dump(model_package, f)
"""

# Define input and output files
annot_file = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Dectector\results\dataset_annotations_only.nc'
noise_file = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Dectector\results\Noise_dataset'
outfile = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Dectector\results\dataset_FS-NN_modified_20201105145300.nc'

# Load measurements
meas_annot = Measurement()
meas_annot.from_netcdf(annot_file)
meas_noise = Measurement()
meas_noise.from_netcdf(noise_file)

## Label noise measurement as 'NN'
meas_noise.insert_values(label_class='NN')
print(meas_noise.summary())

## relabel annotations that are not 'FS' as 'NN'
print(meas_annot.summary())
meas_annot.data['label_class'].replace(to_replace=['', 'ANT','HS','KW','UN'], value='NN', inplace=True)
print(meas_annot.summary())

## merge the 2 datasets
meas_NN_FS = meas_noise + meas_annot
print(meas_NN_FS.summary())

## Save dataset to nc file
meas_NN_FS.to_netcdf(outfile)

# print('-----------------')
# print('  Annotations    ')
# annot = Annotation()
# annot.from_netcdf(annotation_file)
# print(annot.summary())
# annot_perfile = annot.summary(rows='audio_file_name',columns='label_class')
# annot_perfile.rename(columns={"FS": "FS-annot"}, inplace=True)
# annot_perfile = annot_perfile['FS-annot'].to_frame()
# #annot_perfile.to_csv('annot.csv')

print(' ')
print('-----------------')
print('  Detections     ')
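# NOTE: 'detec_file' (path to the detections .nc file) is never defined in this snippet;
# it must be set before this block can run.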
# load detections
detec = Measurement()
detec.from_netcdf(detec_file)
print(detec.summary())
detec_perfile = detec.summary(rows='audio_file_name', columns='label_class')
detec_perfile.rename(columns={"FS": "FS-detec"}, inplace=True)
detec_perfile = detec_perfile['FS-detec'].to_frame()

dd = pd.concat([annot_perfile, detec_perfile], axis=1)
dd['diff'] = dd['FS-annot'] - dd['FS-detec']
dd.plot()

# outdir=r'C:\Users\xavier.mouy\Documents\Workspace\GitHub\ecosound\tests\detec_export'
# detec.to_pamlab(outdir, single_file=False)

# outdir=r'C:\Users\xavier.mouy\Documents\Workspace\GitHub\ecosound\tests\annot_export'
# annot.to_pamlab(outdir, single_file=False)