def test_partial_dependence_no_shadowing(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15842 with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) from sklearn.inspection.partial_dependence import partial_dependence as _ # noqa # Calling all_estimators() also triggers a recursive import of all # submodules, including deprecated ones. all_estimators() from sklearn.inspection import partial_dependence assert isinstance(partial_dependence, types.FunctionType)
def fit(self, train_x, train_y, folds=3):
    """Run a grid search over all mapped classifiers and rank the results.

    Every scikit-learn classifier listed in ``model_param_map`` is tuned with
    GridSearchCV; a ``SimpleClassifier`` summary of each is appended to
    ``self.ranked_list``, which is then sorted by the selected metric.
    """
    estimators = all_estimators(type_filter="classifier")
    for name, ClassifierClass in estimators:
        if name in model_param_map:
            param_grid = model_param_map[name]
            grid_clf = GridSearchCV(
                ClassifierClass(),
                param_grid,
                cv=folds,
                scoring="accuracy",
                verbose=0,
                n_jobs=-1,
            )
            start = time.time()
            grid_clf.fit(train_x, train_y)
            end = time.time()
            clf = SimpleClassifier()
            clf.metrics["Training Accuracy"] = grid_clf.best_score_
            pred_y = grid_clf.predict(train_x)
            clf.metrics["Jaccard Score"] = jaccard_score(
                train_y, pred_y, average="macro")
            clf.metrics["F1 Score"] = f1_score(train_y, pred_y, average="macro")
            clf.sk_model = grid_clf.best_estimator_
            clf.name = name
            clf.attributes = grid_clf.best_params_
            clf.train_duration = grid_clf.refit_time_
            clf.gridsearch_duration = end - start
            self.ranked_list.append(clf)
    # Higher metric values rank first.
    self.ranked_list.sort(reverse=True, key=lambda clf: clf.metrics[self.metric])
def quick_test(self, filter_type="classifier", max_threads=5, save=True):
    # NOTE: max_threads is currently unused; every model gets its own thread.
    print("*Quick test for multiple classification models!")
    threads = []
    for name, estimator_class in all_estimators(filter_type):
        print(f"*start training: {name} model.")
        try:
            model = estimator_class()
            thread = TrainModelThread(
                self.train_X,
                self.train_y,
                self.test_X,
                self.test_y,
                model,
                filter_type,
                name,
                save,
            )
            threads.append(thread)
            thread.start()
        except Exception:
            print(f"*Failed to initialize model: {name}.")
    for thread in threads:
        thread.join()
    print("*Training of all classification models is finished!")
def Cvalidation(iris_data):
    y = iris_data.loc[:, "Name"]
    x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

    # Retrieve every classifier algorithm
    warnings.filterwarnings("ignore")
    allAlgorithms = all_estimators(type_filter="classifier")

    # Splitter object for K-fold cross-validation
    kfold_cv = KFold(n_splits=5, shuffle=True)

    for (name, algorithm) in allAlgorithms:
        try:
            # Instantiate each algorithm; LinearSVC needs more iterations to converge
            if name == "LinearSVC":
                clf = algorithm(max_iter=10000)
            else:
                clf = algorithm()

            # Only evaluate classes that provide a score method
            if hasattr(clf, "score"):
                # Run cross-validation
                scores = cross_val_score(clf, x, y, cv=kfold_cv)
                print(name, "accuracy =")
                print(scores)
        except Exception:
            pass
def Salgorithm(iris_data):
    # Split the iris data into labels and input features
    y = iris_data.loc[:, "Name"]
    x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, train_size=0.8, shuffle=True)

    # Retrieve every classifier algorithm
    warnings.filterwarnings("ignore")
    allAlgorithms = all_estimators(type_filter="classifier")

    for (name, algorithm) in allAlgorithms:
        try:
            # Instantiate each algorithm
            clf = algorithm()

            # Fit, then evaluate on the held-out split
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_test)
            print(name, "accuracy =", accuracy_score(y_test, y_pred))
        # Report the contents of any Warning or Exception
        except Warning as w:
            print("\033[33m" + "Warning:" + "\033[0m", name, ":", w.args)
        except Exception as e:
            # print("\033[31m" + "Error:" + "\033[0m", name, ":", e.args)
            pass
def fit(self, train_x, train_y, folds=3):
    """
    Trains all regressors from the parameter grid by running a model
    algorithm search. Creates a ranked list of models based on the selected
    scoring metric.

    Parameters
    ----------
    train_x : numpy.ndarray
        The features for training the regression model
    train_y : numpy.ndarray
        The corresponding labels for the feature array
    folds : int, optional
        The number of folds for cross validation
    """
    estimators = all_estimators(type_filter="regressor")
    with tqdm(
            total=len(model_param_map),
            desc="Creating Regressor List",
            unit=" Regressor",
            ncols=100,
    ) as progressbar:
        for name, RegressionClass in estimators:
            if name in model_param_map:
                param_grid = model_param_map[name]
                grid_rgr = GridSearchCV(
                    RegressionClass(),
                    param_grid,
                    cv=folds,
                    scoring="neg_root_mean_squared_error",
                    verbose=0,
                    n_jobs=-1,
                    error_score="raise",
                )
                progressbar.update(1)
                start = time.time()
                try:
                    grid_rgr.fit(train_x, train_y)
                except BaseException as error:
                    self.logger.warning(f"{name} failed with error: {error}.")
                    continue
                end = time.time()
                rgr = SimpleRegressor()
                # best_score_ is a negated RMSE, so flip the sign back
                rgr.metrics["Training Score"] = -grid_rgr.best_score_
                pred_y = grid_rgr.predict(train_x)
                rgr.metrics["Mean Absolute Error"] = mean_absolute_error(
                    train_y, pred_y)
                rgr.metrics["Mean Squared Error"] = mean_squared_error(
                    train_y, pred_y)
                rgr.metrics["R-Squared"] = r2_score(train_y, pred_y)
                rgr.sk_model = grid_rgr.best_estimator_
                rgr.name = name
                rgr.attributes = grid_rgr.best_params_
                rgr.train_duration = grid_rgr.refit_time_
                rgr.gridsearch_duration = end - start
                self.ranked_list.append(rgr)
    # Lower error metrics rank first.
    self.ranked_list.sort(reverse=False, key=lambda rgr: rgr.metrics[self.metric])
def fit(self, train_x, train_y, folds=3):
    """Run a grid search over all mapped classifiers and keep only the single
    best model, judged by cross-validated training accuracy."""
    estimators = all_estimators(type_filter="classifier")
    for name, ClassifierClass in estimators:
        if name in model_param_map:
            param_grid = model_param_map[name]
            grid_clf = GridSearchCV(
                ClassifierClass(),
                param_grid,
                cv=folds,
                scoring="accuracy",
                verbose=0,
                n_jobs=-1,
            )
            start = time.time()
            grid_clf.fit(train_x, train_y)
            end = time.time()
            # Keep the model only if it beats the current best score
            if grid_clf.best_score_ > self.metrics.get(
                    "Training Accuracy", 0.0):
                self.metrics["Training Accuracy"] = grid_clf.best_score_
                pred_y = grid_clf.predict(train_x)
                self.metrics["Jaccard Score"] = jaccard_score(
                    train_y, pred_y, average="macro")
                self.metrics["F1 Score"] = f1_score(train_y, pred_y,
                                                    average="macro")
                self.sk_model = grid_clf.best_estimator_
                self.name = name
                self.attributes = grid_clf.best_params_
                self.train_duration = grid_clf.refit_time_
                self.gridsearch_duration = end - start
def quick_test(self, filter_type="classifier", max_threads=5, save=True):
    label_df = pd.read_csv("data/train_label.csv", index_col="arrival_date")
    print("*Quick test for multiple classification models!")
    threads = []
    for name, estimator_class in all_estimators(filter_type):
        print(f"*start training: {name} model.")
        try:
            model = estimator_class()
            thread = TrainModelThread2(
                self.X_df.copy(),
                self.X_train.copy(),
                self.y_train.copy(),
                self.X_test.copy(),
                self.y_test.copy(),
                label_df.copy(),
                model,
                filter_type,
                name,
                save,
            )
            threads.append(thread)
            thread.start()
            # Stop launching new threads once the cap is reached
            if len(threads) > max_threads:
                break
        except Exception:
            print(f"*Failed to initialize model: {name}.")
    for thread in threads:
        thread.join()
    print("*Training of all classification models is finished!")
def _all_estimators():
    try:
        from sklearn.utils import all_estimators

        return all_estimators()
    except ImportError:
        # Older scikit-learn versions do not expose sklearn.utils.all_estimators,
        # so fall back to a vendored copy of the discovery logic.
        return _backported_all_estimators()
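# A hedged usage sketch for the compatibility wrapper above, assuming
# _backported_all_estimators() yields the same (name, class) pairs as the
# upstream function: count discovered estimators per sklearn submodule,
# independent of the installed scikit-learn version.
from collections import Counter

per_module = Counter(cls.__module__.split(".")[1] for _, cls in _all_estimators())
print(per_module.most_common(5))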
def test_all_estimator_no_base_class(): # test that all_estimators doesn't find abstract classes. for name, Estimator in all_estimators(): msg = ( "Base estimators such as {0} should not be included in all_estimators" ).format(name) assert not name.lower().startswith("base"), msg
def make_paragraph_for_estimator_type(estimator_type): intro = nodes.list_item() intro += nodes.strong( text="Estimators that allow NaN values for type ") intro += nodes.literal(text=f"{estimator_type}") intro += nodes.strong(text=":\n") exists = False lst = nodes.bullet_list() for name, est_class in all_estimators(type_filter=estimator_type): with suppress(SkipTest): est = _construct_instance(est_class) if est._get_tags().get("allow_nan"): module_name = ".".join(est_class.__module__.split(".")[:2]) class_title = f"{est_class.__name__}" class_url = f"generated/{module_name}.{class_title}.html" item = nodes.list_item() para = nodes.paragraph() para += nodes.reference(class_title, text=class_title, internal=False, refuri=class_url) exists = True item += para lst += item intro += lst return [intro] if exists else None
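# Outside Sphinx, the same discovery loop can be run as a plain script. A
# minimal sketch, assuming the private _get_tags() API of the scikit-learn
# versions the helper above targets; estimators that cannot be constructed
# without arguments are simply skipped.
from contextlib import suppress
from sklearn.utils import all_estimators

for name, Est in all_estimators(type_filter="transformer"):
    with suppress(Exception):
        if Est()._get_tags().get("allow_nan"):
            print(name)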
def getClassifiers():
    from sklearn.utils import all_estimators
    import sklearn
    import xgboost as xgb

    estimators = all_estimators()  # NOTE: unused; the list below is hand-picked
    classifiers = []
    classifiers.append(sklearn.ensemble._bagging.BaggingClassifier())  # OK
    classifiers.append(sklearn.tree._classes.DecisionTreeClassifier())  # OK
    classifiers.append(sklearn.ensemble._forest.ExtraTreesClassifier())  # OK
    classifiers.append(sklearn.naive_bayes.BernoulliNB())  # OK
    classifiers.append(sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier())  # OK
    classifiers.append(xgb.XGBClassifier())
    classifiers.append(sklearn.linear_model._logistic.LogisticRegressionCV())  # OK
    classifiers.append(sklearn.tree._classes.ExtraTreeClassifier())  # OK
    classifiers.append(sklearn.ensemble._forest.RandomForestClassifier())  # OK
    classifiers.append(sklearn.linear_model._logistic.LogisticRegression())  # OK
    # classifiers.append(sklearn.ensemble._gb.GradientBoostingClassifier())  # OK, but takes a long time

    c = []
    for classifier in classifiers:
        c.append(type(classifier).__name__)
    return c
def reg_dict(): _all_regressors = {} estimators = all_estimators() for name, class_ in estimators: if issubclass(class_, base.RegressorMixin): _all_regressors[name] = class_ return _all_regressors
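# The manual issubclass check above can be delegated to scikit-learn itself,
# since all_estimators accepts a type_filter argument. A minimal equivalent
# sketch (reg_dict_filtered is a hypothetical name):
from sklearn.utils import all_estimators

def reg_dict_filtered():
    # type_filter="regressor" restricts discovery to RegressorMixin subclasses.
    return dict(all_estimators(type_filter="regressor"))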
def getClassifiers():
    from sklearn.utils import all_estimators
    import sklearn
    import xgboost as xgb

    estimators = all_estimators()  # NOTE: unused; the list below is hand-picked
    classifiers = []
    classifiers.append(sklearn.ensemble._gb.GradientBoostingClassifier())  # OK
    # classifiers.append(sklearn.neighbors._classification.KNeighborsClassifier())  # OK
    classifiers.append(sklearn.linear_model._logistic.LogisticRegressionCV())  # OK
    classifiers.append(sklearn.svm._classes.NuSVC())  # OK
    classifiers.append(sklearn.gaussian_process._gpc.GaussianProcessClassifier())  # OK
    classifiers.append(sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis())  # OK
    classifiers.append(sklearn.linear_model._logistic.LogisticRegression())  # OK
    classifiers.append(xgb.XGBClassifier())
    classifiers.append(sklearn.naive_bayes.BernoulliNB())  # OK
    classifiers.append(sklearn.svm._classes.SVC())  # OK, but check the output! Seems to be one-sided...
    classifiers.append(sklearn.tree._classes.DecisionTreeClassifier())  # OK
    classifiers.append(sklearn.calibration.CalibratedClassifierCV(base_estimator=sklearn.ensemble._weight_boosting.AdaBoostClassifier()))  # OK
    classifiers.append(sklearn.linear_model._stochastic_gradient.SGDClassifier())  # OK
    classifiers.append(sklearn.naive_bayes.GaussianNB())  # OK
    classifiers.append(sklearn.neural_network._multilayer_perceptron.MLPClassifier())  # OK
    classifiers.append(sklearn.multiclass.OneVsRestClassifier(sklearn.ensemble._weight_boosting.AdaBoostClassifier()))  # OK
    classifiers.append(sklearn.ensemble._forest.RandomForestClassifier())  # OK
    classifiers.append(sklearn.tree._classes.ExtraTreeClassifier())  # OK
    classifiers.append(sklearn.ensemble._forest.ExtraTreesClassifier())  # OK
    classifiers.append(sklearn.discriminant_analysis.LinearDiscriminantAnalysis())  # OK
    classifiers.append(sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier())  # OK

    c = []
    for classifier in classifiers:
        c.append(type(classifier).__name__)
    return c
def _generate_meta_estimator_instances_with_pipeline():
    """Generate instances of meta-estimators fed with a pipeline.

    A meta-estimator is taken to be any estimator whose signature accepts one
    of "estimator", "base_estimator", "regressor", "estimators", or
    "transformer_list".
    """
    for _, Estimator in sorted(all_estimators()):
        sig = set(signature(Estimator).parameters)
        if "estimator" in sig or "base_estimator" in sig or "regressor" in sig:
            if is_regressor(Estimator):
                estimator = make_pipeline(TfidfVectorizer(), Ridge())
                param_grid = {"ridge__alpha": [0.1, 1.0]}
            else:
                estimator = make_pipeline(TfidfVectorizer(), LogisticRegression())
                param_grid = {"logisticregression__C": [0.1, 1.0]}

            if "param_grid" in sig or "param_distributions" in sig:
                # SearchCV estimators
                extra_params = {"n_iter": 2} if "n_iter" in sig else {}
                yield Estimator(estimator, param_grid, **extra_params)
            else:
                yield Estimator(estimator)

        elif "transformer_list" in sig:
            # FeatureUnion
            transformer_list = [
                ("trans1", make_pipeline(TfidfVectorizer(), MaxAbsScaler())),
                (
                    "trans2",
                    make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False)),
                ),
            ]
            yield Estimator(transformer_list)

        elif "estimators" in sig:
            # stacking, voting
            if is_regressor(Estimator):
                estimator = [
                    ("est1", make_pipeline(TfidfVectorizer(), Ridge(alpha=0.1))),
                    ("est2", make_pipeline(TfidfVectorizer(), Ridge(alpha=1))),
                ]
            else:
                estimator = [
                    (
                        "est1",
                        make_pipeline(TfidfVectorizer(), LogisticRegression(C=0.1)),
                    ),
                    ("est2", make_pipeline(TfidfVectorizer(), LogisticRegression(C=1))),
                ]
            yield Estimator(estimator)

        else:
            continue
def _tested_estimators(): for name, Estimator in all_estimators(): try: estimator = _construct_instance(Estimator) except SkipTest: continue yield estimator
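# Generators like _tested_estimators are typically consumed by
# pytest.mark.parametrize. A hedged sketch of that wiring; the test body is
# illustrative only.
import pytest

@pytest.mark.parametrize(
    "estimator", _tested_estimators(), ids=lambda est: type(est).__name__
)
def test_get_params_returns_dict(estimator):
    # get_params() is part of the BaseEstimator contract.
    assert isinstance(estimator.get_params(), dict)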
def _tested_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): try: estimator = _construct_instance(Estimator) except SkipTest: continue yield estimator
def test_all_estimators_all_public():
    # all_estimators should not fail when pytest is not installed and should
    # return only public estimators
    with warnings.catch_warnings(record=True) as record:
        estimators = all_estimators()
    # no warnings are raised
    assert not record
    for name, Estimator in estimators:
        # all_estimators() yields (name, class) pairs, so check the public name
        assert not name.startswith("_")
def _tested_estimators(): for name, Estimator in all_estimators(): if issubclass(Estimator, BiclusterMixin): continue try: estimator = _construct_instance(Estimator) except SkipTest: continue yield estimator
def fit(self, train_x, train_y, folds=3):
    """Trains all classification models from the parameter grid by running
    a model algorithm search. Creates a ranked list of models based on the
    selected scoring metric.

    Parameters
    ----------
    train_x : numpy.ndarray
        The features for training the classification model
    train_y : numpy.ndarray
        The corresponding labels for the feature array
    folds : int, optional
        The number of folds for cross validation
    """
    estimators = all_estimators(type_filter="classifier")
    for name, ClassifierClass in estimators:
        if name in model_param_map:
            param_grid = model_param_map[name]
            grid_clf = GridSearchCV(
                ClassifierClass(),
                param_grid,
                cv=folds,
                scoring="accuracy",
                verbose=0,
                n_jobs=-1,
            )
            start = time.time()
            try:
                grid_clf.fit(train_x, train_y)
            except BaseException as error:
                self.logger.warning(f"{name} failed with error: {error}.")
                continue
            end = time.time()
            clf = SimpleClassifier()
            clf.metrics["Training Accuracy"] = grid_clf.best_score_
            pred_y = grid_clf.predict(train_x)
            clf.metrics["Jaccard Score"] = jaccard_score(train_y, pred_y,
                                                         average="macro")
            clf.metrics["F1 Score"] = f1_score(train_y, pred_y, average="macro")
            clf.sk_model = grid_clf.best_estimator_
            clf.name = name
            clf.attributes = grid_clf.best_params_
            clf.train_duration = grid_clf.refit_time_
            clf.gridsearch_duration = end - start
            self.ranked_list.append(clf)
    # Higher metric values rank first.
    self.ranked_list.sort(reverse=True, key=lambda clf: clf.metrics[self.metric])
def fit(self, train_x, train_y, folds=3):
    """Trains the optimal regression model on the given dataset by running
    a model algorithm search. If the argument folds isn't passed, the
    default value (3) is used.

    Parameters
    ----------
    train_x : numpy.ndarray
        The features for training the regression model
    train_y : numpy.ndarray
        The corresponding labels for the feature array
    folds : int, optional
        The number of folds for cross validation
    """
    estimators = all_estimators(type_filter="regressor")
    for name, RegressionClass in estimators:
        if name in model_param_map:
            param_grid = model_param_map[name]
            grid_rgr = GridSearchCV(
                RegressionClass(),
                param_grid,
                cv=folds,
                scoring="neg_root_mean_squared_error",
                verbose=0,
                n_jobs=-1,
                error_score="raise",
            )
            start = time.time()
            try:
                grid_rgr.fit(train_x, train_y)
            except BaseException as error:
                self.failed_models.append(name)
                self.logger.warning(f"{name} failed with error: {error}.")
                continue
            end = time.time()
            # Keep the model with the lowest (sign-flipped) RMSE seen so far
            if self.metrics.get(
                    "Training Score"
            ) is None or -grid_rgr.best_score_ < self.metrics.get(
                    "Training Score"):
                self.metrics["Training Score"] = -grid_rgr.best_score_
                pred_y = grid_rgr.predict(train_x)
                self.metrics["Mean Absolute Error"] = mean_absolute_error(
                    train_y, pred_y)
                self.metrics["Mean Squared Error"] = mean_squared_error(
                    train_y, pred_y)
                self.metrics["R-Squared"] = r2_score(train_y, pred_y)
                self.sk_model = grid_rgr.best_estimator_
                self.name = name
                self.attributes = grid_rgr.best_params_
                self.train_duration = grid_rgr.refit_time_
                self.gridsearch_duration = end - start
def fit(self, train_x, train_y, folds=3):
    """Trains the optimal classification model on the given dataset by
    running a model algorithm search. If the argument folds isn't passed,
    the default value (3) is used.

    Parameters
    ----------
    train_x : numpy.ndarray
        The features for training the classification model
    train_y : numpy.ndarray
        The corresponding labels for the feature array
    folds : int, optional
        The number of folds for cross validation
    """
    estimators = all_estimators(type_filter="classifier")
    for name, ClassifierClass in estimators:
        if name in model_param_map:
            param_grid = model_param_map[name]
            grid_clf = GridSearchCV(
                ClassifierClass(),
                param_grid,
                cv=folds,
                scoring="accuracy",
                verbose=0,
                n_jobs=-1,
            )
            start = time.time()
            try:
                grid_clf.fit(train_x, train_y)
            except BaseException as error:
                self.failed_models.append(name)
                self.logger.warning(f"{name} failed with error: {error}.")
                continue
            end = time.time()
            # Keep the model with the highest training accuracy seen so far
            if grid_clf.best_score_ > self.metrics.get(
                    "Training Accuracy", 0.0):
                self.metrics["Training Accuracy"] = grid_clf.best_score_
                pred_y = grid_clf.predict(train_x)
                self.metrics["Jaccard Score"] = jaccard_score(
                    train_y, pred_y, average="macro")
                self.metrics["F1 Score"] = f1_score(train_y, pred_y,
                                                    average="macro")
                self.sk_model = grid_clf.best_estimator_
                self.name = name
                self.attributes = grid_clf.best_params_
                self.train_duration = grid_clf.refit_time_
                self.gridsearch_duration = end - start
def valid_components(self): """Find all supported regressors. Returns: valid_components: numpy.array([[regressor name, object], ...]) Valid regressors """ if not hasattr(self, "valid_components_r"): regressors = np.array([est for est in all_estimators() if issubclass(est[1], RegressorMixin)]) self.valid_components_r = regressors return self.valid_components_r
def _tested_linear_classifiers(): classifiers = all_estimators(type_filter="classifier") with warnings.catch_warnings(record=True): for name, clazz in classifiers: required_parameters = getattr(clazz, "_required_parameters", []) if len(required_parameters): # FIXME continue if "class_weight" in clazz().get_params().keys() and issubclass( clazz, LinearClassifierMixin): yield name, clazz
def valid_components(self): """Find all supported classifiers. Returns: valid_components: numpy.array([[classifier name, object], ...]) Valid classifiers """ if not hasattr(self, "valid_components_c"): classifiers = np.array([ est for est in all_estimators() if issubclass(est[1], ClassifierMixin) ]) self.valid_components_c = classifiers return self.valid_components_c
def get_all_methods():
    estimators = all_estimators()
    for name, Estimator in estimators:
        if name.startswith("_"):
            # skip private classes
            continue
        methods = []
        for attr_name in dir(Estimator):  # avoid shadowing the estimator name
            if attr_name.startswith("_"):
                continue
            method_obj = getattr(Estimator, attr_name)
            if callable(method_obj) or isinstance(method_obj, property):
                methods.append(attr_name)
        # None acts as a sentinel for the class itself
        methods.append(None)

        for method in sorted(methods, key=str):
            yield Estimator, method
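# A short usage sketch for the generator above; islice keeps the output
# manageable. A method of None refers to the class itself.
from itertools import islice

for Estimator, method in islice(get_all_methods(), 10):
    print(Estimator.__name__, method)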
def generate_curriculum(self, X, y, path, method):
    X = np.array(X)
    y = np.array(y)
    if method == 'kdn':
        score = self.kdn_score(X, y, 50)
        curriculum_df = pd.DataFrame(score, columns=['score'])
        curriculum_df.reset_index(inplace=True)
        curriculum_df.to_csv(path, index=False)
    elif method == 'faiss_kdn':
        score = FaissKNNClassifier().faiss_kdn_score(X, y, 50)
        curriculum_df = pd.DataFrame(score, columns=['score'])
        curriculum_df.reset_index(inplace=True)
        curriculum_df.to_csv(path, index=False)
    elif method == 'gmm':
        curriculum_df = self.GMM_IH(X, y)
        curriculum_df.to_csv(path, index=False)
    elif method == 'ensemble':
        estimators = all_estimators(type_filter='classifier')
        clf_l = ["RandomForestClassifier", "MLPClassifier", "SVC"]
        classifiers = []
        for name, class_ in estimators:
            if hasattr(class_, 'predict_proba') and name in clf_l:
                if name == "SVC":
                    # SVC only exposes predict_proba when probability=True
                    clf = class_(probability=True)
                else:
                    clf = class_()
                classifiers.append(clf)
        # NOTE: all_estimators() lists names alphabetically, so classifiers is
        # [MLPClassifier, RandomForestClassifier, SVC] and the positional
        # labels below line up; see the order-independent sketch after this block.
        estimator = VotingClassifier(estimators=[('mlp', classifiers[0]),
                                                 ('rf', classifiers[1]),
                                                 ('svm', classifiers[2])],
                                     voting='soft')
        curriculum_df = self.ensemble_hardness(X, y, estimator)
        curriculum_df.to_csv(path, index=False)
    else:
        print("Aborting generation")
        return
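# The 'ensemble' branch above depends on all_estimators() listing names
# alphabetically so that the positional labels match. An order-independent
# sketch of the same ensemble, built with explicit imports instead of discovery:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

estimator = VotingClassifier(
    estimators=[
        ("mlp", MLPClassifier()),
        ("rf", RandomForestClassifier()),
        ("svm", SVC(probability=True)),  # probability=True enables soft voting
    ],
    voting="soft",
)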
elif Estimator.__name__ == "Pipeline": return Estimator(steps=[("clf", LogisticRegression())]) elif Estimator.__name__ == "FeatureUnion": return Estimator(transformer_list=[("transformer", FunctionTransformer())]) def _construct_sparse_coder(Estimator): # XXX: hard-coded assumption that n_features=3 dictionary = np.array( [[0, 1, 0], [-1, -1, 2], [1, 1, 1], [0, 1, 1], [0, 2, 1]], dtype=np.float64, ) return Estimator(dictionary=dictionary) @pytest.mark.parametrize("name, Estimator", all_estimators()) def test_fit_docstring_attributes(name, Estimator): pytest.importorskip("numpydoc") from numpydoc import docscrape doc = docscrape.ClassDoc(Estimator) attributes = doc["Attributes"] if Estimator.__name__ in ( "HalvingRandomSearchCV", "RandomizedSearchCV", "HalvingGridSearchCV", "GridSearchCV", ): est = _construct_searchcv_instance(Estimator) elif Estimator.__name__ in (
dataset = load_wine()
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

import sklearn
print(sklearn.__version__)  # 0.23.2

# all_estimators is tuned for scikit-learn 0.20
allAlgorithms = all_estimators(type_filter='classifier')  # every classifier model in sklearn
# print(allAlgorithms)

for (name, algorithm) in allAlgorithms:
    try:
        model = algorithm()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print(name, 'accuracy:', accuracy_score(y_test, y_pred))
    except Exception:
        # continue
        print(name, 'is not available')  # algorithm missing in 0.23.2

# The scores can serve as a baseline metric.
def test_all_estimators_all_public():
    # all_estimators should not fail when pytest is not installed and should
    # return only public estimators
    estimators = all_estimators()
    for name, Estimator in estimators:
        # all_estimators() yields (name, class) pairs, so check the public name
        assert not name.startswith("_")