コード例 #1
0
def train_sblc(
    idflabelspath,
    algo,
    outputDir="../working-directories/4-pre-trained-classifiers/",
    savePickle=False,
):
    """Train the specified algorithm on the specified dataset and, if
    requested, save the trained classifier into a Pickle object.


    Parameters
    ----------
    idflabelspath: str
        Path to the dataset with identified labels. When `savePickle` is
        True, its file name must have the form `<prefix>.<key>.<ext>`
        (exactly two dots): `<key>` is reused in the name of the pickle.

    algo: str
        Name of the supervised algorithm to use. Possible choices:
        {RandomForestClassifier, KNeighborsClassifier, DecisionTreeClassifier,
        AdaBoostClassifier, LabelSpreading}
        Short aliases are also accepted: {rf, RandomForest}, {knn,
        nearestneighbors}, {dt, DecisionTree}, {ab, adab, AdaBoost}, {ls}.
        For more details, refer to the documentation of scikit-learn

    outputDir: str
        Directory where will be stored the outputs

    savePickle: bool, default=False
        If False, the trained classifier is not saved


    Returns
    -------
    clf: sklearn Classifier object
        Trained classifier. To be used with `predict` method. The following
        extra attributes are attached to it:
        `label_identification_` and `label_long_names_` (as loaded from the
        dataset), `scaler` (the fitted `StandardScaler`) and
        `training_class_centroids_` (mean of the normalized training samples
        of each class; row `i` corresponds to `clf.classes_[i]`).

    If `savePickle` is True, `<ClassifierName>.<key>.pkl` is also written
    in `outputDir`.


    Raises
    ------
    ValueError
        If `algo` is not one of the supported names, or if `savePickle` is
        True and the file name of `idflabelspath` is not of the form
        `<prefix>.<key>.<ext>`.


    Example
    -------
    >>> from blusc.supervisedfit import train_sblc
    >>> inputDir = "../working-directories/3-identified-labels/"
    >>> idfname = "IDFLABELS_2015_0219.PASSY2015_BT-T_linear_dz40_dt30_zmax2000.nc"
    >>> clf = train_sblc(inputDir + idfname, algo ="KNeighborsClassifier")
    Classifier not saved because savePickle= False
    >>> clf.classes_
    array([0, 1, 2], dtype=int32)
    """
    import os

    # Instantiate classifier
    # ----------------------
    # Done first so that an unsupported `algo` fails before any file is read.
    clf = _build_classifier(algo)

    # Resolve output file
    # -------------------
    # Only needed when saving; checked before training so that a malformed
    # file name does not waste a full fit.
    droppath = None
    if savePickle:
        idflabelsname = os.path.basename(idflabelspath)
        nameparts = idflabelsname.split(".")
        if len(nameparts) != 3:
            raise ValueError(
                "Cannot derive pickle name from %r: expected a file name "
                "of the form <prefix>.<key>.<ext>" % idflabelsname
            )
        prepkey = nameparts[1]
        dropfilename = type(clf).__name__ + "." + prepkey + ".pkl"
        droppath = os.path.join(outputDir, dropfilename)

    # Load dataset
    # ------------
    # Altitude and time are returned by the loader but not used for training.
    X_raw, _z_common, _t_common, rawlabl, lablid, lablnames = utils.load_dataset(
        idflabelspath,
        variables_to_load=["X_raw", "altitude", "time", "rawlabels"],
        fields_to_load=["label_identification", "label_long_names"],
    )

    # Normalization
    # -------------
    scaler = StandardScaler()
    scaler.fit(X_raw)
    X = scaler.transform(X_raw)

    # Fit supervised model
    # ---------------------
    clf.fit(X, rawlabl)

    # Exports
    # -----------
    clf.label_identification_ = lablid
    clf.label_long_names_ = lablnames
    clf.scaler = scaler

    # Centroid of each class in the normalized space. Iterate over the labels
    # the classifier actually learned rather than assuming they are 0..n-1,
    # so that row `i` always matches `clf.classes_[i]`.
    labels = np.asarray(rawlabl)
    centroids = np.zeros((clf.classes_.size, X.shape[1]))
    for row, label in enumerate(clf.classes_):
        centroids[row, :] = np.mean(X[labels == label, :], axis=0)

    clf.training_class_centroids_ = centroids

    if savePickle:
        # Context manager guarantees the file is closed even if dump fails.
        with open(droppath, "wb") as fc:
            pickle.dump(clf, fc)
        print("Trained classifier saved in ", droppath)
    else:
        print("Classifier not saved because savePickle=", savePickle)

    return clf


def _build_classifier(algo):
    """Return an unfitted scikit-learn classifier for the name `algo`.

    Raises ValueError when `algo` is not a supported name. scikit-learn
    modules are imported lazily, only for the selected algorithm.
    """
    if algo in ["rf", "RandomForest", "RandomForestClassifier"]:
        from sklearn.ensemble import RandomForestClassifier

        return RandomForestClassifier(n_estimators=50, max_depth=3)

    if algo in ["knn", "nearestneighbors", "KNeighborsClassifier"]:
        from sklearn.neighbors import KNeighborsClassifier

        return KNeighborsClassifier(n_neighbors=6)

    if algo in ["dt", "DecisionTree", "DecisionTreeClassifier"]:
        from sklearn.tree import DecisionTreeClassifier

        return DecisionTreeClassifier(max_depth=5)

    if algo in ["ab", "adab", "AdaBoost", "AdaBoostClassifier"]:
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier

        weak_learner = DecisionTreeClassifier(max_depth=4)
        try:
            # Newer scikit-learn releases name this parameter `estimator`.
            return AdaBoostClassifier(estimator=weak_learner, n_estimators=50)
        except TypeError:
            # Older releases only know `base_estimator`.
            return AdaBoostClassifier(
                base_estimator=weak_learner, n_estimators=50
            )

    if algo in ["ls", "LabelSpreading"]:
        from sklearn.semi_supervised import LabelSpreading

        return LabelSpreading(kernel="knn", alpha=0.2)

    raise ValueError("Not supported algorithm: %r" % (algo,))