def _make_classifier(algo):
    """Return an unfitted scikit-learn classifier for the algorithm name `algo`.

    Raises
    ------
    ValueError
        If `algo` is not one of the supported names or aliases.
    """
    if algo in ["rf", "RandomForest", "RandomForestClassifier"]:
        from sklearn.ensemble import RandomForestClassifier

        return RandomForestClassifier(n_estimators=50, max_depth=3)

    if algo in ["knn", "nearestneighbors", "KNeighborsClassifier"]:
        from sklearn.neighbors import KNeighborsClassifier

        return KNeighborsClassifier(n_neighbors=6)

    if algo in ["dt", "DecisionTree", "DecisionTreeClassifier"]:
        from sklearn.tree import DecisionTreeClassifier

        return DecisionTreeClassifier(max_depth=5)

    if algo in ["ab", "adab", "AdaBoost", "AdaBoostClassifier"]:
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier

        # The weak learner is passed positionally on purpose: the keyword was
        # renamed from `base_estimator` to `estimator` in later scikit-learn
        # releases, while the first positional slot is the same in both.
        return AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=4), n_estimators=50
        )

    if algo in ["ls", "LabelSpreading"]:
        from sklearn.semi_supervised import LabelSpreading

        return LabelSpreading(kernel="knn", alpha=0.2)

    raise ValueError("Not supported algorithm: {!r}".format(algo))


def _preparation_key(idflabelspath):
    """Return the middle component of a `<prefix>.<prepkey>.<ext>` file name.

    Raises
    ------
    ValueError
        If the file name does not consist of exactly three dot-separated parts.
    """
    import os

    filename = os.path.basename(idflabelspath)
    parts = filename.split(".")
    if len(parts) != 3:
        raise ValueError(
            "Unexpected dataset file name {!r}: expected "
            "'<prefix>.<prepkey>.<ext>'".format(filename)
        )
    return parts[1]


def train_sblc(
    idflabelspath,
    algo,
    outputDir="../working-directories/4-pre-trained-classifiers/",
    savePickle=False,
):
    """Train the specified algorithm on the specified dataset.

    The features are standardized, the classifier is fitted on the raw
    labels, and a few extra attributes are attached to the fitted object
    (`label_identification_`, `label_long_names_`, `scaler`,
    `training_class_centroids_`). Optionally the classifier is pickled.

    Parameters
    ----------
    idflabelspath: str
        Path to the dataset with identified labels. The file name must have
        the form `<prefix>.<prepkey>.<ext>`; `prepkey` is reused in the name
        of the pickle file.

    algo: str
        Name of the supervised algorithm to use. Possible choices (aliases
        in parentheses):
        RandomForestClassifier (rf, RandomForest),
        KNeighborsClassifier (knn, nearestneighbors),
        DecisionTreeClassifier (dt, DecisionTree),
        AdaBoostClassifier (ab, adab, AdaBoost),
        LabelSpreading (ls).
        For more details, refer to the documentation of scikit-learn.

    outputDir: str
        Directory where will be stored the outputs

    savePickle: bool, default=False
        If False, the trained classifier is not saved

    Returns
    -------
    clf: sklearn Classifier object
        Trained classifier. To be used with `predict` method.
        When `savePickle` is True, it is also written to
        `<ClassName>.<prepkey>.pkl` in `outputDir`.

    Raises
    ------
    ValueError
        If `algo` is not supported or the dataset file name does not have
        the expected three dot-separated components. Both checks happen
        before the dataset is loaded.

    Example
    -------
    >>> from blusc.supervisedfit import train_sblc
    >>> inputDir = "../working-directories/3-identified-labels/"
    >>> idfname = "IDFLABELS_2015_0219.PASSY2015_BT-T_linear_dz40_dt30_zmax2000.nc"
    >>> clf = train_sblc(inputDir + idfname, algo ="KNeighborsClassifier")
    Classifier not saved because savePickle= False
    >>> clf.classes_
    array([0, 1, 2], dtype=int32)
    """
    import os

    # Validate the cheap inputs first so that a typo does not cost a full
    # dataset load and training run.
    clf = _make_classifier(algo)
    prepkey = _preparation_key(idflabelspath)

    # Load dataset
    # ------------
    X_raw, z_common, t_common, rawlabl, lablid, lablnames = utils.load_dataset(
        idflabelspath,
        variables_to_load=["X_raw", "altitude", "time", "rawlabels"],
        fields_to_load=["label_identification", "label_long_names"],
    )

    # Normalization
    # -------------
    scaler = StandardScaler()
    scaler.fit(X_raw)
    X = scaler.transform(X_raw)

    # Fit supervised model
    # ---------------------
    clf.fit(X, rawlabl)

    # Exports
    # -----------
    clf.label_identification_ = lablid
    clf.label_long_names_ = lablnames
    clf.scaler = scaler

    # One centroid per fitted class, in the order of `clf.classes_`.
    # Iterating over the actual class values (rather than 0..n_classes-1)
    # keeps the rows correct when labels are not consecutive integers
    # starting at zero.
    labels = np.asarray(rawlabl)
    centroids = np.zeros((clf.classes_.size, X.shape[1]))
    for row, class_value in enumerate(clf.classes_):
        members = np.where(labels == class_value)[0]
        centroids[row, :] = np.mean(X[members, :], axis=0)
    clf.training_class_centroids_ = centroids

    dropfilename = type(clf).__name__ + "." + prepkey + ".pkl"

    if savePickle:
        droppath = os.path.join(outputDir, dropfilename)
        with open(droppath, "wb") as fc:
            pickle.dump(clf, fc)
        print("Trained classifier saved in ", droppath)
    else:
        print("Classifier not saved because savePickle=", savePickle)

    return clf