def run_experiment_WSD(
    train_coll,
    test_coll,
    test_frac,
    lt,
    Tpipe,
    labsHierarchy,
    out_fN,
    cv,
    clf_pipe,
    gs_grid,
    class_balance=None,
    metric=None,
    predictionsDir=None,
):
    """Runs a classification (clf) experiment: feature extraction, grid search
    and scoring; appends the scores to out_fN.

    Parameters
    ----------
    train_coll : list
        collection of (wav file, annotation file) pairs used for training
    test_coll : list
        collection of (wav file, annotation file) pairs used for testing
    test_frac : float
        fraction of the data held out as test set (train_test_split)
    lt : ML.labelTransformer
    Tpipe : feature extraction pipeline; the extraction function is Tpipe.fun
    labsHierarchy : list of strings
    out_fN : str
        output file where the scores are appended
    cv : int
        number of cross-validation folds
    clf_pipe : list
        (name, estimator) steps for the sklearn Pipeline
    gs_grid : list
        parameter grid for the grid search
    class_balance : str
        name of the class to balance for
    metric : string or sklearn.metrics scorer
    predictionsDir : str

    Returns
    -------
    clf_best : the best estimator found by the grid search
    """
    feExFun = Tpipe.fun

    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:  # balance classes
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    # labsD = lt.targetNumNomDict()

    ## train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=0
    )

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(
        estimator=pipe, param_grid=gs_grid, scoring=scoring, cv=cv, n_jobs=-1
    )
    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(out_fN, "a") as out_file:  # print details about the dataset into status file
        # out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(
            ", ".join([str(list(y_names).count(item)) for item in lt.classes_])
        )
        ## sizes of the train/test sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf scores
    with open(out_fN, "a") as out_file:
        out_file.write("")  # ", {}".format(str(gs.best_params_).replace('\n', ', '), gs.best_score_))

    clf_best = gs.best_estimator_

    ## clf scores over the test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(
            ", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc))
        )
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(
            ", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc), 100 * 2 * np.std(cv_acc))
        )

    ## print P, R and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, "a") as out_file:
        settings_str = Tpipe_settings_and_header(Tpipe)[1]
        out_file.write(", " + settings_str + "\n")

    """
    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_)  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, 'a') as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(100 * f1, 100 * P, 100 * R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(f1 * 100), bN))
            pT.predictSoundSections(wavF, clf, lt, feExFun,
                                    annSections=labsHierarchy, outF=annFile_predict)
    """
    return clf_best
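# Example call (a minimal sketch, not part of the original experiment code): it shows
# how a classifier pipeline and a grid-search grid for run_experiment_WSD could look.
# Everything not defined in this module is an assumption -- the file paths, the label
# hierarchy, the label transformer `lt` and the feature-extraction pipeline `Tpipe`
# come from project-specific helpers (ML.labelTransformer, the Tpipe/T_settings
# machinery) whose construction is not shown here.
#
# from sklearn.ensemble import RandomForestClassifier
#
# ## sklearn Pipeline steps and a matching parameter grid; the step name "clf" is
# ## referenced in the grid keys via the "clf__" prefix
# clf_pipe = [("clf", RandomForestClassifier(random_state=0))]
# gs_grid = [{"clf__n_estimators": [50, 100], "clf__max_depth": [None, 10]}]
#
# ## collections of (wav file, annotation file) pairs -- hypothetical paths
# train_coll = [("train_01.wav", "train_01.txt"), ("train_02.wav", "train_02.txt")]
# test_coll = [("test_01.wav", "test_01.txt")]
#
# clf_best = run_experiment_WSD(
#     train_coll, test_coll, test_frac=0.3,
#     lt=lt,                        # ML.labelTransformer over the classifier labels
#     Tpipe=Tpipe,                  # feature-extraction pipeline exposing Tpipe.fun
#     labsHierarchy=["c", "b"],     # hypothetical label hierarchy
#     out_fN="experiment_scores.csv",
#     cv=5,
#     clf_pipe=clf_pipe,
#     gs_grid=gs_grid,
#     class_balance="c",            # balance the instances towards class "c"
#     metric="f1_macro",            # passed to MLvl.get_scorer (assumed sklearn-style string)
# )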