Code Example #1
File: experimentTools.py Project: caoba1/pylotwhale
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
# NOTE: the project-local modules used below (fex, myML, MLvl) are assumed to
# be imported at module level in experimentTools.py; only the external
# imports are shown here.


def run_experiment_WSD(train_coll, test_coll, test_frac,
                       lt, Tpipe, labsHierarchy, 
                       out_fN,
                       cv, clf_pipe, gs_grid, 
                       class_balance=None, metric=None,
                       predictionsDir=None):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        estimators: list
            for pipline
        gs_grid: list
                    
        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
	class_balance: str
		name of the class to balance for
        metric: string or sklearn.metrics.scorer
    """

    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    #labsD = lt.targetNumNomDict()
    ## scores header
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)

    #### CLF
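    # MLvl.get_scorer presumably resolves `metric` (a name or scorer object)
    # into a scorer usable by GridSearchCV and cross_val_score below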
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    ### PRINT
    with open(out_fN, 'a') as out_file: # print details about the dataset into status file
        #out_file.write("# {} ({})\n".format( collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(", ".join([str(list(y_names).count(item)) 
                                  for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf params/score: placeholder, currently nothing extra is written
    # with open(out_fN, 'a') as out_file:
    #     out_file.write(", {}, {}".format(gs.best_params_, gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100*np.mean(cv_sc),
                                                  100*2*np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100*np.mean(cv_acc),
                                                    100*2*np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)
    
    ### Tpipe -- feature extraction params
    with open(out_fN, 'a') as out_file:
        settings_str = Tpipe_settings_and_header(Tpipe)[1]
        out_file.write(", " + settings_str + '\n')

    return clf_best
Code Example #2
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
# NOTE: the project-local modules used below (fex, myML, MLvl, expT) are
# assumed to be imported at module level, as elsewhere in pylotwhale.


def run_experiment_WSD(
    train_coll,
    test_coll,
    test_frac,
    lt,
    Tpipe,
    labsHierarchy,
    out_fN,
    cv,
    clf_pipe,
    gs_grid,
    class_balance=None,
    metric=None,
    predictionsDir=None,
):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        estimators: list
            for pipline
        gs_grid: list

        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
    class_balance: str
        name of the class to balance for
        metric: string or sklearn.metrics.scorer
    """

    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    ## scores header

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(out_fN, "a") as out_file:  # print details about the dataset into the status file
        # out_file.write("# {} ({})\n".format( collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(",".join(
            [str(list(y_names).count(item)) for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf params/score: placeholder, currently nothing extra is written
    # with open(out_fN, "a") as out_file:
    #     out_file.write(", {}, {}".format(gs.best_params_, gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, "a") as out_file:
        settings_str = expT.Tpipe_settings_and_header(Tpipe)[1]
        out_file.write("," + settings_str + "\n")

    """
    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_) # clone to create a new classifier with the same parameters
    clf.fit(X,y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, 'a') as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(100*f1,
                                                                100*P, 100*R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(f1*100),
                                                             bN))
            pT.predictSoundSections(wavF, clf,  lt, feExFun, annSections=labsHierarchy,
                                    outF=annFile_predict)

    """

    return clf_best
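
The core scikit-learn pattern both examples share, reduced to a self-contained, runnable sketch with synthetic data. The feature extraction and the pylotwhale helpers are replaced by numpy arrays; all names here are illustrative, not the project's.

import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# synthetic stand-in for the extracted features X and numeric labels y
rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = rng.randint(0, 2, size=200)

# held-out split, as in run_experiment_WSD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# pipeline + grid search over its step parameters
pipe = Pipeline([('scaler', StandardScaler()), ('clf', SVC())])
grid = [{'clf__C': [1, 10], 'clf__gamma': ['scale', 0.01]}]
gs = GridSearchCV(estimator=pipe, param_grid=grid, scoring='f1_macro',
                  cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

# score the best estimator on the held-out set: mean and 2*std over CV folds
cv_sc = cross_val_score(gs.best_estimator_, X_test, y_test, scoring='f1_macro')
print("{:2.2f}, {:.2f}".format(100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))

Note that run_experiment_WSD evaluates by cross-validating the best estimator on the held-out test set (mean and twice the standard deviation over folds) rather than by a single fit/predict; the sketch mirrors that choice.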