# Shared imports for the examples below (a minimal set inferred from the code;
# fex, myML, MLvl, pT and expT are project-local modules assumed to be importable).
import os
import time
from collections import Counter

import numpy as np
import sklearn.base as skb
import sklearn.metrics as mt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


def runWSD2Experiment(train_coll,
                      test_coll,
                      lt,
                      T_settings,
                      labsHierarchy,
                      cv,
                      out_fN,
                      testColl_scoreClassLabels,
                      readSections,
                      param=None,
                      predictionsDir=None,
                      keepSections='default',
                      scoring=None):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list of (wavF, annF, template_annF)
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
        scoring: string or sklearn.metrics.scorer
    """

    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    X, y_names = X0, y0_names  # optionally balance classes: myML.balanceToClass(X0, y0_names, 'c')
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    with open(out_fN, 'a') as out_file:  # print dataset details to the status file
        # NOTE: collFi_train (and testFrac below) are not parameters of this
        # function; they are assumed to be module-level variables.
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            lt.targetNumNomDict(), lt.classes_, Counter(y_names)))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    with open(out_fN, 'a') as out_file:  # print details to status file
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    # NOTE: `estimators` and `param_grid` are not parameters of this function; they
    # are assumed module-level variables (cf. runExperiment below, which takes them
    # as arguments).
    pipe = Pipeline(estimators)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    ## best clf scores
    with open(out_fN, 'a') as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace('\n', ''), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        # NOTE: `param` must be numeric here (and where predictions are written
        # below); its default of None would break the format call
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write("{:2.2f}, {:2.2f}, {:.2f}, ".format(
            param, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                  100 * 2 * np.std(cv_acc)))
    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(lt.num2nom(y_true),
                                               lt.num2nom(y_pred),
                                               out_fN,
                                               labels=lt.classes_)
    #strini="\nTest Set", strend="\n")

    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(
        gs.best_estimator_
    )  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF, template_annF in test_coll[:]:
        ### print WSD2 scores, see WSD2_experiment.ipynb
        MLvl.printWSD2_scores(
            wavF,
            true_annF=annF,
            template_annF=template_annF,
            WSD2_clf=clf,
            WSD2_feExFun=feExFun,
            lt=lt,
            scoreClassLabels=testColl_scoreClassLabels,
            outF=out_fN,
            strini=", ",
            strend="",
            m='auto',
            #strini="\nTESTCOLL", strend="\n", m='auto',
            readSectionsWSD2=readSections,  # for WSD2
            labelsHierarchy=['c'])

        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(param), bN))
            pT.WSD2predictAnnotations(wavF,
                                      template_annF,
                                      feExFun,
                                      lt,
                                      clf,
                                      outF=annFile_predict,
                                      readSections=readSections,
                                      keepSections=keepSections)

    with open(out_fN, 'a') as out_file:
        out_file.write("\n")

    return clf
    settingsStr = "{}-{}".format(feature_str, clfStr)

    #### write in out file
    out_file = open(out_file_scores, 'a')
    out_file.write("\n###---------   {}   ---------###\n".format(
        time.strftime("%Y.%m.%d\t\t%H:%M:%S")))
    out_file.write("#" + settingsStr)
    out_file.close()

    #### load collection
    WavAnnCollection = fex.readCols(collFi_train, colIndexes=(0, 1))
    print("\ncollection:", len(WavAnnCollection), "\nlast file:",
          WavAnnCollection[-1])

    #### compute features
    trainDat = fex.wavAnnCollection2datXy(
        WavAnnCollection, feExFun)  #, wavPreprocesingT=wavPreprocessingFun)
    ## y_names train and test data
    X, y_names = trainDat.filterInstances(labs)  # train
    lt = myML.labelTransformer(labs)
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    #### train/test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    with open(out_file_scores,
              'a') as out_file:  # print details to status file
        out_file.write("\n#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))
Example #3
summDict = {"n_textWS": 20, "normalise": True}
summType = "walking"
T_settings.append(("summ", (summType, summDict)))  # T_settings: assumed initialised in an earlier notebook cell

Tpipe = fex.makeTransformationsPipeline(T_settings)

##### clf settings
testFrac = 0.2
clf_labs = ["b", "c", "w"]
labsHierarchy = ["c", "w"]

feExFun = Tpipe.fun

# In[ ]:

dataO = fex.wavAnnCollection2datXy(collList, feExFun, labsHierarchy)  # collList: training collection from an earlier cell

# In[ ]:

X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs; lt: labelTransformer from an earlier cell
X, y_names = X0, y0_names  # optionally balance classes: myML.balanceToClass(X0, y0_names, 'c')
y = lt.nom2num(y_names)
labsD = lt.targetNumNomDict()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# In[ ]:
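
The split above would typically feed the same Pipeline-plus-GridSearchCV pattern used by the function examples on this page. A minimal continuation sketch, where the scaler/SVC steps and grid values are illustrative assumptions:

# Continuation sketch (assumed): grid-search a simple pipeline on the split above.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([("scaler", StandardScaler()), ("clf", SVC())])
gs = GridSearchCV(estimator=pipe,
                  param_grid={"clf__C": [1, 10, 100]},
                  cv=5,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)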

Example #4
def run_experiment_WSD(train_coll, test_coll, test_frac,
                       lt, Tpipe, labsHierarchy, 
                       out_fN,
                       cv, clf_pipe, gs_grid, 
                       class_balance=None, metric=None,
                       predictionsDir=None):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        estimators: list
            for pipline
        gs_grid: list
                    
        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
	class_balance: str
		name of the class to balance for
        metric: string or sklearn.metrics.scorer
    """

    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names  # (already balanced above when class_balance is set)
    y = lt.nom2num(y_names)
    #labsD = lt.targetNumNomDict()
    ## scores header
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    ### PRINT
    with open(out_fN, 'a') as out_file: # print details about the dataset into status file
        #out_file.write("# {} ({})\n".format( collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(", ".join([str(list(y_names).count(item)) 
                                  for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf scores
    # with open(out_fN, 'a') as out_file:  # (no-op in the original: wrote an empty string)
    #     out_file.write(", {}".format(str(gs.best_params_).replace('\n', ', '),
    #                                  gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100*np.mean(cv_sc),
                                                  100*2*np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100*np.mean(cv_acc),
                                                    100*2*np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)
    
    ### Tpipe -- feature extraction params
    with open(out_fN, 'a') as out_file:
        settings_str = Tpipe_settings_and_header(Tpipe)[1]  # assumed imported (cf. expT.Tpipe_settings_and_header below)
        out_file.write(", " + settings_str + '\n')

    return clf_best  # the original fell off the end; returning the fitted estimator matches the variant below
Example #5
def runExperiment(
    train_coll,
    test_coll,
    lt,
    T_settings,
    labsHierarchy,
    out_fN,
    cv,
    pipe_estimators,
    gs_grid,
    scoring=None,
    param=None,
    predictionsDir=None,
):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        pipe_estimators: list
            for pipline
        gs_grid: list

        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
        scoring: string or sklearn.metrics.scorer
    """

    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    X, y_names = X0, y0_names  # optionally balance classes: myML.balanceToClass(X0, y0_names, 'c')
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    with open(out_fN, "a") as out_file:  # print dataset details to the status file
        # NOTE: collFi_train (and testFrac below) are not parameters of this
        # function; they are assumed to be module-level variables.
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            lt.targetNumNomDict(), lt.classes_, Counter(y_names)))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    with open(out_fN, "a") as out_file:  # print details to status file
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    pipe = Pipeline(pipe_estimators)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    ## best clf scores
    with open(out_fN, "a") as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace("\n", ""), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        # NOTE: `param` must be numeric; its default of None would break this format call
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write("{:2.2f}, {:2.2f}, {:.2f}, ".format(
            param, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                  100 * 2 * np.std(cv_acc)))
    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(
        gs.best_estimator_
    )  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num("c")
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF,
                                            annF,
                                            feExFun,
                                            labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, "a") as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(
                100 * f1, 100 * P, 100 * R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(
                predictionsDir, "{}_{}_{}".format(int(f1 * 100), int(param),
                                                  bN))
            pT.predictSoundSections(wavF,
                                    clf,
                                    lt,
                                    feExFun,
                                    annSections=labsHierarchy,
                                    outF=annFile_predict)

    with open(out_fN, "a") as out_file:
        out_file.write("\n")

    return clf
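
A hypothetical call to runExperiment, showing one way the pipeline steps and grid might be assembled; the estimator choices and values are illustrative assumptions.

# Hypothetical usage sketch; estimator choices and grid values are assumptions.
# NOTE: runExperiment also reads collFi_train and testFrac at module level
# (see the NOTEs inside the function).
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe_estimators = [("scaler", StandardScaler()), ("clf", SVC())]
gs_grid = [{"clf__C": [1, 10, 100], "clf__gamma": ["scale", 0.01]}]

clf = runExperiment(train_coll, test_coll, lt, T_settings,
                    labsHierarchy=["c", "w"], out_fN="scores.txt", cv=5,
                    pipe_estimators=pipe_estimators, gs_grid=gs_grid,
                    scoring="f1_macro", param=0.0)  # param must be numeric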
Example #6
def run_experiment_WSD(
    train_coll,
    test_coll,
    lt,
    Tpipe,
    labsHierarchy,
    out_fN,
    cv,
    clf_pipe,
    gs_grid,
    class_balance=None,
    metric=None,
    predictionsDir=None,
):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        estimators: list
            for pipline
        gs_grid: list

        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
    class_balance: str
        name of the class to balance for
        metric: string or sklearn.metrics.scorer
    """

    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names  # (already balanced above when class_balance is set)
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    ## scores header

    # NOTE: unlike the variant above, this version takes no test_frac argument;
    # testFrac is assumed to be a module-level variable
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(
            out_fN, "a"
    ) as out_file:  # print details about the dataset into status file
        # out_file.write("# {} ({})\n".format( collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(",".join(
            [str(list(y_names).count(item)) for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf scores
    # with open(out_fN, "a") as out_file:  # (no-op in the original: wrote an empty string)
    #     out_file.write(", {}".format(str(gs.best_params_).replace('\n', ', '),
    #                                  gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, "a") as out_file:
        settings_str = expT.Tpipe_settings_and_header(Tpipe)[1]
        out_file.write("," + settings_str + "\n")

    ### settings
    # settings_str = Tpipe_settings_and_header(Tpipe)[1]
    # out_file.write(", {}\n".format(settings_str))
    """
    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_) # clone to create a new classifier with the same parameters
    clf.fit(X,y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, 'a') as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(100*f1,
                                                                100*P, 100*R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(f1*100),
                                                             bN))
            pT.predictSoundSections(wavF, clf,  lt, feExFun, annSections=labsHierarchy,
                                    outF=annFile_predict)

    """

    return clf_best
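
Each run of these functions appends one comma-separated row to out_fN (class counts, split sizes, cv score and accuracy with spreads, per-class scores in whatever format MLvl.print_precision_recall_fscore_support writes, then the Tpipe settings), so the status file can be loaded for comparison across runs. A post-processing sketch, assuming '#' prefixes the free-form annotation lines some variants also write:

# Post-processing sketch (assumptions: rows are comparable across runs and
# '#' marks the annotation lines).
import pandas as pd

df = pd.read_csv("scores.txt", header=None, comment="#")
print(df.shape[0], "runs,", df.shape[1], "fields per run")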