import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedGroupKFold  # scikit-learn >= 1.0
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from ecosound.core.measurement import Measurement  # assumed import path

# The project-specific helpers used in main() (add_class_ID, add_subclass,
# add_group, plot_datasets_distrib, plot_dataset_distrib, plot_datasets_groups,
# cross_validation, summarize_performance, plot_PR_curves, plot_F_curves,
# classification_train, classification_predict) are assumed to be defined
# elsewhere in this module or imported from the project package.


def main():
    # input arguments
    input_args = dict()
    input_args['positive_class_label'] = 'FS'
    input_args['train_ratio'] = 0.75
    input_args['cv_splits'] = 10  # 5
    input_args['cv_repeats'] = 1
    input_args['rebalance_classes'] = True
    # input_args['data_file'] = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Detector\results\dataset_FS-NN_modified_20201105145300.nc'
    input_args['data_file'] = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Detector\results\dataset_FS-NN_modified_20200902194334.nc'
    input_args['out_dir'] = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Detector\results\Classification'
    input_args['run_CV'] = False
    input_args['train_final_model'] = True
    input_args['final_model_name'] = 'RF50'

    ## DEFINITION OF CLASSIFIERS -------------------------------------------------
    models = []
    models.append(('Dummy', DummyClassifier(strategy="constant", constant=1)))
    models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    # models.append(('KNN', KNeighborsClassifier()))
    # models.append(('KNN', KNeighborsClassifier(n_neighbors=4, metric='euclidean')))
    models.append(('CART', DecisionTreeClassifier()))
    # models.append(('NB', GaussianNB()))
    models.append(('XGBoost', XGBClassifier()))
    # models.append(('MLP', MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=0)))
    models.append(('RF5', RandomForestClassifier(n_estimators=5,
                                                 min_samples_split=100,
                                                 min_samples_leaf=50,
                                                 random_state=0)))
    models.append(('RF10', RandomForestClassifier(n_estimators=10,
                                                  min_samples_split=100,
                                                  min_samples_leaf=50,
                                                  random_state=0)))
    models.append(('RF30', RandomForestClassifier(n_estimators=30,
                                                  min_samples_split=100,
                                                  min_samples_leaf=50,
                                                  random_state=0)))
    models.append(('RF50', RandomForestClassifier(n_estimators=50,
                                                  min_samples_split=100,
                                                  min_samples_leaf=50,
                                                  random_state=0)))
    # models.append(('RF100', RandomForestClassifier(n_estimators=100, min_samples_split=100, min_samples_leaf=50, random_state=0)))

    ## Setup output folder
    now = datetime.now()
    now_str = now.strftime("%Y%m%dT%H%M%S")
    out_dir = os.path.join(input_args['out_dir'], now_str)
    os.mkdir(out_dir)

    ## Save input args to txt file
    text_file = open(os.path.join(out_dir, 'input_args_' + now_str + '.txt'), "w")
    text_file.write(str(input_args))
    text_file.close()

    ## Check that the model name exists before running all the processing
    if input_args['train_final_model']:
        model_idx = [model[0] for model in models].index(input_args['final_model_name'])

    ## LOAD DATASET ---------------------------------------------------------------
    dataset = Measurement()
    dataset.from_netcdf(input_args['data_file'])
    print(dataset.summary())

    ## DATA PREPARATION -----------------------------------------------------------
    # features
    features = dataset.metadata['measurements_name'][0]  # list of features used for the classification
    # data
    data = dataset.data
    # drop FS observations at Mill Bay
    indexNames = data[(data['label_class'] == 'FS')
                      & (data['location_name'] == 'Mill bay')].index
    data.drop(indexNames, inplace=True)
    # add subclass + IDs
    data, class_encoder = add_class_ID(data, input_args['positive_class_label'])
    data, _ = add_subclass(data)
    # subclass2class_table = subclass2class_conversion(data)
    # add group ID
    data, group_encoder = add_group(data)

    ## DATA CLEAN-UP --------------------------------------------------------------
    # Basic stats on all features
    data_stats = data[features].describe()
    # print(data_stats)
    # how many NaNs and Infs per column
    data = data.replace([np.inf, -np.inf], np.nan)
    Nnan = data[features].isna().sum()
    ax = Nnan.plot(kind='bar', title='Number of NaN/Inf', grid=True)
    ax.set_ylabel('Number of observations with NaNs/Infs')
    # Drop some features with too many NaNs
    features.remove('freq_flatness')
    features.remove('snr')
    features.remove('uuid')
    # drop observations/rows with NaNs
    data.dropna(subset=features, axis=0, how='any', thresh=None, inplace=True)
    data_stats2 = data[features].describe()

    # ## VISUALIZATION -------------------------------------------------------------
    # # box and whisker plots
    # data[features].plot(kind='box', subplots=True, layout=(7, 7), sharex=False, sharey=False)
    # # histograms
    # data[features].hist()
    # # scatter plot matrix
    # pd.plotting.scatter_matrix(data[features])
    # # scatter plot PCA
    # pca = PCA(n_components=2)
    # X = pca.fit_transform(data[features])
    # y = data['class_ID']
    # plot_2d_space(X, y, 'Imbalanced dataset (2 PCA components)')

    ## SPLIT DATA INTO TRAIN & TEST SETS ------------------------------------------
    n_splits = round(1 / (1 - input_args['train_ratio']))
    skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=None)
    for train_index, test_index in skf.split(data,
                                             data['subclass_ID'],
                                             groups=data['group_ID']):
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        break
    # plot class repartition
    plot_datasets_distrib(data_train, data_test)
    plot_dataset_distrib(data,
                         attr_list=['subclass_label', 'label_class'],
                         title='Full dataset')
    plot_dataset_distrib(data_train,
                         attr_list=['subclass_label', 'label_class'],
                         title='Training set')
    plot_dataset_distrib(data_test,
                         attr_list=['subclass_label', 'label_class'],
                         title='Test set')
    # verify groups are not used in both datasets
    groups_intersection = plot_datasets_groups(data_train, data_test, show=True)

    ## CROSS VALIDATION ON TRAIN SET ----------------------------------------------
    if input_args['run_CV']:
        # run train/test experiments
        cv_predictions, cv_performance = cross_validation(
            data_train,
            models,
            features,
            cv_splits=input_args['cv_splits'],
            cv_repeats=input_args['cv_repeats'],
            rebalance=input_args['rebalance_classes'])
        # display summary results
        performance_report = summarize_performance(cv_performance, threshold=0.5)
        print(performance_report)
        # plot mean Precision and Recall curves
        plot_PR_curves(cv_performance)
        plot_F_curves(cv_performance)
        # save results
        CV_results = {
            'cv_predictions': cv_predictions,
            'cv_performance': cv_performance,
            'models': models,
            'input_args': input_args,
        }
        pickle.dump(CV_results,
                    open(os.path.join(out_dir, 'CV_' + now_str + '.sav'), 'wb'))

    ## FINAL EVALUATION ON TEST SET -----------------------------------------------
    if input_args['train_final_model']:
        print(' ')
        print('Final evaluation on test set:')
        print(' ')
        model_name = models[model_idx][0]
        model = models[model_idx][1]  # RF50
        print(model)
        X_train = data_train[features]    # features
        Y_train = data_train['class_ID']  # labels
        X_test = data_test[features]      # features
        Y_test = data_test['class_ID']    # labels
        # feature normalization
        Norm_mean = X_train.mean()
        Norm_std = X_train.std()
        X_train = (X_train - Norm_mean) / Norm_std
        X_test = (X_test - Norm_mean) / Norm_std
        # Train on entire train set
        final_model = classification_train(X_train,
                                            Y_train,
                                            model,
                                            rebalance=input_args['rebalance_classes'])
        # Evaluate on full test set
        pred_class, pred_prob = classification_predict(X_test, final_model)
        # Print evaluation report
        CR = classification_report(Y_test, pred_class)
        print(CR)
        # save the model to disk
        model = {
            'name': model_name,
            'model': final_model,
            'features': features,
            'normalization_mean': Norm_mean,
            'normalization_std': Norm_std,
            'classes': class_encoder,
            'input_args': input_args,
        }
        pickle.dump(model,
                    open(os.path.join(out_dir,
                                      model_name + '_model_' + now_str + '.sav'),
                         'wb'))
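
# A minimal sketch (not part of the original script) of how the pickled dict
# saved above could be reloaded and applied to new measurements. The function
# name (load_and_classify), the measurement_file argument, and the 0.5 decision
# threshold are illustrative assumptions; column 1 of predict_proba is taken as
# the positive 'FS' class, matching the binary FS/NN setup used in main().
def load_and_classify(model_file, measurement_file, threshold=0.5):
    """Sketch: reload a saved classifier dict and classify a new Measurement file."""
    saved = pickle.load(open(model_file, 'rb'))
    meas = Measurement()
    meas.from_netcdf(measurement_file)
    X = meas.data[saved['features']]
    # apply the normalization statistics learned on the training set
    X = (X - saved['normalization_mean']) / saved['normalization_std']
    # drop rows with NaNs/Infs, as done before training
    X = X.replace([np.inf, -np.inf], np.nan).dropna()
    pred_prob = saved['model'].predict_proba(X)[:, 1]
    pred_class = pred_prob >= threshold
    return pred_class, pred_prob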
""" # Define input and output files annot_file = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Dectector\results\dataset_annotations_only.nc' noise_file = r'C:\Users\xavier.mouy\Documents\PhD\Projects\Dectector\results\Noise_dataset' outfile=r'C:\Users\xavier.mouy\Documents\PhD\Projects\Dectector\results\dataset_FS-NN_modified_20201105145300.nc' # Load measurements meas_annot = Measurement() meas_annot.from_netcdf(annot_file) meas_noise = Measurement() meas_noise.from_netcdf(noise_file) ## Label noise measurement as 'NN' meas_noise.insert_values(label_class='NN') print(meas_noise.summary()) ## relabel annotations that are not 'FS' as 'NN' print(meas_annot.summary()) meas_annot.data['label_class'].replace(to_replace=['', 'ANT','HS','KW','UN'], value='NN', inplace=True) print(meas_annot.summary()) ## merge the 2 datasets meas_NN_FS = meas_noise + meas_annot print(meas_NN_FS.summary()) ## Save dataset to nc file meas_NN_FS.to_netcdf(outfile)
# print('-----------------')
# print(' Annotations ')
# annot = Annotation()
# annot.from_netcdf(annotation_file)
# print(annot.summary())
# annot_perfile = annot.summary(rows='audio_file_name', columns='label_class')
# annot_perfile.rename(columns={"FS": "FS-annot"}, inplace=True)
# annot_perfile = annot_perfile['FS-annot'].to_frame()
# # annot_perfile.to_csv('annot.csv')

print(' ')
print('-----------------')
print(' Detections ')
# load detections
detec = Measurement()
detec.from_netcdf(detec_file)
print(detec.summary())
detec_perfile = detec.summary(rows='audio_file_name', columns='label_class')
detec_perfile.rename(columns={"FS": "FS-detec"}, inplace=True)
detec_perfile = detec_perfile['FS-detec'].to_frame()

dd = pd.concat([annot_perfile, detec_perfile], axis=1)
dd['diff'] = dd['FS-annot'] - dd['FS-detec']
dd.plot()

# outdir = r'C:\Users\xavier.mouy\Documents\Workspace\GitHub\ecosound\tests\detec_export'
# detec.to_pamlab(outdir, single_file=False)
# outdir = r'C:\Users\xavier.mouy\Documents\Workspace\GitHub\ecosound\tests\annot_export'
# annot.to_pamlab(outdir, single_file=False)
"""
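
# Standard entry-point guard; the original script's entry point is not shown in
# this excerpt, so this is an assumption.
if __name__ == '__main__':
    main()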