def test_bagging():
    """Compare a converted BaggingClassifier against the fitted original.

    Bagged decision trees are fitted on the iris data for the multiclass
    target and two binarised targets, over a small grid of ``n_estimators``,
    ``max_depth`` and ``max_features``. For every method named in ``METHODS``
    the output shape and values of the converted estimator must match.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for n_estimators in [1, 10]:
            for max_depth in [5, 10, None]:
                for max_features in [0.75, 1.0]:
                    base_tree = DecisionTreeClassifier(
                        max_depth=max_depth, random_state=5
                    )
                    bagger = BaggingClassifier(
                        base_tree,
                        bootstrap=False,
                        n_estimators=n_estimators,
                        random_state=5,
                        max_features=max_features,
                    )
                    bagger.fit(features, target)
                    converted = convert_estimator(bagger)
                    for method in METHODS:
                        # Warnings raised while scoring are silenced.
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            expected = getattr(bagger, method)(features)
                            actual = getattr(converted, method)(features_as_list)
                            assert np.allclose(expected.shape, shape(actual))
                            assert np.allclose(expected, actual, equal_nan=True)
def test_dict_vectorizer():
    """A converted DictVectorizer must produce the same matrix as the original.

    Uses the module-level ``X`` (defined outside this block) for both fitting
    and transforming; both outputs are densified before comparison.
    """
    vectorizer = DictVectorizer()
    vectorizer.fit(X)
    converted = convert_estimator(vectorizer)
    expected = vectorizer.transform(X).toarray()
    actual = converted.transform(X).todense()
    assert np.allclose(expected, actual)
def test_max_abs_scaler():
    """Check that a converted MaxAbsScaler transforms ``X`` like the original.

    ``X`` is the module-level fixture defined outside this block.
    """
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X)
    # The comparison has to be asserted: a bare np.allclose(...) call throws
    # its result away and the test could never fail.
    assert np.allclose(X_t, X_t_)
def test_dict_vectorizer_dense():
    """A converted dense DictVectorizer must reproduce the original output.

    The vectorizer is built with ``sparse=False``, so both transforms are
    compared directly without densifying.
    """
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit(X)
    converted = convert_estimator(vectorizer)
    expected = vectorizer.transform(X)
    actual = converted.transform(X)
    assert np.allclose(expected, actual)
def test_normalizer():
    """Check converted Normalizer output for the l1, l2 and max norms.

    ``X`` is the module-level fixture defined outside this block.
    """
    for norm in ["l1", "l2", "max"]:
        tform = Normalizer(norm=norm)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X)
        # Assert the comparison; previously its result was discarded, so a
        # mismatch went unnoticed.
        assert np.allclose(X_t, X_t_)
def test_max_abs_scaler_sparse():
    """Check a converted MaxAbsScaler on the sparse form of ``X``.

    The original estimator transforms ``X`` as-is, while the converted one
    receives ``tosparse(X)``; its output is densified before comparison.
    """
    X_sparse = tosparse(X)
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X_sparse)
    # Assert the comparison; previously its result was discarded, so a
    # mismatch went unnoticed.
    assert np.allclose(X_t, X_t_.todense())
def test_min_max_scaler():
    """Check converted MinMaxScaler output for several feature ranges.

    ``X`` is the module-level fixture defined outside this block.
    """
    for feature_range in [(0, 1), (1, 2), (-1, 1)]:
        tform = MinMaxScaler(feature_range=feature_range)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X)
        # Assert the comparison; previously its result was discarded, so a
        # mismatch went unnoticed.
        assert np.allclose(X_t, X_t_)
def test_tfidf_vectorizer():
    """A converted TfidfVectorizer must match the original for each norm.

    Runs with ``norm`` set to "l1", "l2" and ``None``; both outputs are
    densified before comparison.
    """
    norms = ["l1", "l2", None]
    for norm in norms:
        vectorizer = TfidfVectorizer(norm=norm)
        vectorizer.fit(X)
        converted = convert_estimator(vectorizer)
        expected = vectorizer.transform(X).toarray()
        actual = converted.transform(X).todense()
        assert np.allclose(expected, actual)
def test_onehotencoder():
    """A converted OneHotEncoder must encode its training rows identically.

    Two small mixed string/integer tables are each used to fit an encoder
    with ``handle_unknown="ignore"`` and are then transformed by both the
    original and the converted encoder.
    """
    tables = [
        [["Male", 1], ["Female", 3], ["Female", 2]],
        [["Male", 1], ["Female", 27], ["Bananas", 2]],
    ]
    for rows in tables:
        encoder = OneHotEncoder(handle_unknown="ignore")
        encoder.fit(rows)
        converted = convert_estimator(encoder)
        expected = encoder.transform(rows).toarray()
        actual = converted.transform(rows).todense()
        assert np.allclose(expected, actual)
def test_ordinalencoder():
    """A converted OrdinalEncoder must encode its training rows identically.

    Two small mixed string/integer tables are each used to fit an encoder and
    are then transformed by both the original and the converted encoder.
    """
    tables = [
        [["Male", 1], ["Female", 3], ["Female", 2]],
        [["Male", 1], ["Female", 27], ["Bananas", 2]],
    ]
    for rows in tables:
        encoder = OrdinalEncoder()
        encoder.fit(rows)
        converted = convert_estimator(encoder)
        expected = encoder.transform(rows)
        actual = converted.transform(rows)
        assert np.allclose(expected, actual)
def convert_to_pure_predict():
    """Convert the pickled friend-rating classifier and save the result.

    Loads ``friend_rating_classifier.pkl`` through ``db_functions``, passes
    it through ``convert_estimator`` and stores the converted object as
    ``friend_rating_classifier_pure_predict.pkl``.
    """
    source_path = "friend_rating_classifier.pkl"
    target_path = "friend_rating_classifier_pure_predict.pkl"
    original = db_functions.load_pickle(source_path)
    converted = convert_estimator(original)
    db_functions.save_pickle(converted, target_path)
def test_standard_scaler_sparse():
    """Check a converted StandardScaler on the sparse form of ``X``.

    Centering is disabled (``with_mean=False``) and ``with_std`` is toggled.
    The original estimator transforms ``X`` as-is, while the converted one
    receives ``tosparse(X)``; its output is densified before comparison.
    """
    X_sparse = tosparse(X)
    for with_std in [True, False]:
        tform = StandardScaler(with_mean=False, with_std=with_std)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X_sparse)
        # Assert the comparison; previously its result was discarded, so a
        # mismatch went unnoticed.
        assert np.allclose(X_t, X_t_.todense())
def test_normalizer_sparse():
    """Check a converted Normalizer on the sparse form of ``X``.

    Runs for the l1, l2 and max norms. The original estimator transforms
    ``X`` as-is, while the converted one receives ``tosparse(X)``; its output
    is densified before comparison.
    """
    X_sparse = tosparse(X)
    for norm in ["l1", "l2", "max"]:
        tform = Normalizer(norm=norm)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X_sparse)
        # Assert the comparison; previously its result was discarded, so a
        # mismatch went unnoticed.
        assert np.allclose(X_t, X_t_.todense())
def test_standard_scaler():
    """Check converted StandardScaler output for all mean/std settings.

    ``X`` is the module-level fixture defined outside this block.
    """
    for with_mean in [True, False]:
        for with_std in [True, False]:
            tform = StandardScaler(with_mean=with_mean, with_std=with_std)
            tform.fit(X)
            tform_ = convert_estimator(tform)
            X_t = tform.transform(X)
            X_t_ = tform_.transform(X)
            # Assert the comparison; previously its result was discarded, so
            # a mismatch went unnoticed.
            assert np.allclose(X_t, X_t_)
def test_feature_union_sparse():
    """A converted FeatureUnion must match the original on sparse input.

    The union combines a non-centering StandardScaler with a MaxAbsScaler,
    fitted on iris. The original transforms the array, while the converted
    union receives the ``tosparse`` form of the same rows.
    """
    features, labels = load_iris(return_X_y=True)
    sparse_features = tosparse(features.tolist())
    steps = [
        ("ss", StandardScaler(with_mean=False)),
        ("mms", MaxAbsScaler()),
    ]
    union = FeatureUnion(steps)
    union.fit(features, labels)
    converted = convert_estimator(union)
    expected = union.transform(features)
    actual = converted.transform(sparse_features).todense()
    assert np.allclose(expected, actual)
def test_hashing_vectorizer():
    """Check converted HashingVectorizer output for each norm setting.

    Runs with ``norm`` set to "l1", "l2" and ``None`` on the module-level
    ``X`` (defined outside this block); both outputs are densified before
    comparison.
    """
    for norm in ["l1", "l2", None]:
        vec = HashingVectorizer(n_features=2**8, norm=norm)
        vec.fit(X)
        vec_ = convert_estimator(vec)
        # Transform once per estimator and compare those results; the
        # earlier version computed them, ignored them and transformed again.
        X_t = vec.transform(X)
        X_t_ = vec_.transform(X)
        assert np.allclose(X_t.toarray(), X_t_.todense())
def test_feature_union():
    """Check that a converted FeatureUnion of imputers matches the original.

    The union stacks a mean and a median SimpleImputer fitted on iris. The
    original transforms the array, while the converted union receives the
    same rows as nested lists.
    """
    X, y = load_iris(return_X_y=True)
    X_ = X.tolist()
    union = FeatureUnion([
        ("imp_mean", SimpleImputer(strategy="mean")),
        ("imp_median", SimpleImputer(strategy="median")),
    ])
    union.fit(X, y)
    union_ = convert_estimator(union)
    # Reuse the list form built above instead of converting X a second time.
    assert np.allclose(union.transform(X), union_.transform(X_))
def test_pipeline():
    """Check that a converted imputer + logistic regression Pipeline matches.

    The pipeline is fitted on iris and ``predict_proba`` is compared between
    the original (array input) and the converted pipeline (nested-list
    input).
    """
    X, y = load_iris(return_X_y=True)
    X_ = X.tolist()
    lr = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=1000)
    pipe = Pipeline(steps=[
        ("imp", SimpleImputer()),
        ("lr", lr),
    ])
    pipe.fit(X, y)
    pipe_ = convert_estimator(pipe)
    # Reuse the list form built above instead of converting X a second time.
    assert np.allclose(pipe.predict_proba(X), pipe_.predict_proba(X_))
def test_dummy():
    """Compare a converted prior-strategy DummyClassifier with the original.

    Fitted on iris for the multiclass target and two binarised targets; every
    method named in ``METHODS`` must agree in output shape and values.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        dummy = DummyClassifier(strategy="prior")
        dummy.fit(features, target)
        converted = convert_estimator(dummy)
        for method in METHODS:
            expected = getattr(dummy, method)(features)
            actual = getattr(converted, method)(features_as_list)
            assert np.allclose(expected.shape, shape(actual))
            assert np.allclose(expected, actual, equal_nan=True)
def load(cls, vectorizer_folder, realtime=False):
    """
    Rebuild an instance from the artefacts saved in a folder.

    Parameters
    ----------
    vectorizer_folder: str
        Folder holding ``vectorizer_query.pkl``, ``vectorizer_prefix.pkl``,
        ``delim.json`` and, optionally, ``max_prefix_len.json``.
    realtime: bool
        When true, both vectorizers are passed through ``convert_estimator``
        before the instance is built.

    Returns
    -------
    The result of calling ``cls`` with the loaded ``model_query``,
    ``model_prefix``, ``delim`` and ``max_prefix_len``.

    Notes
    -----
    The vectorizers are read with ``pickle.load``, which can execute
    arbitrary code; only load folders that come from a trusted source.
    """

    def _unpickle(filename):
        # Read one pickled object from the vectorizer folder.
        with open(pathlib.Path(vectorizer_folder, filename), "rb") as handle:
            return pickle.load(handle)

    def _json_field(filename, key):
        # Read a single key from a JSON file in the vectorizer folder.
        with open(pathlib.Path(vectorizer_folder, filename), "r") as handle:
            return json.load(handle)[key]

    model_query = _unpickle("vectorizer_query.pkl")
    model_prefix = _unpickle("vectorizer_prefix.pkl")
    delim = _json_field("delim.json", "delim")

    # max_prefix_len is optional: any failure to read it falls back to None.
    try:
        max_prefix_len = _json_field("max_prefix_len.json", "max_prefix_len")
    except Exception:
        LOGGER.warning(
            "max_prefix_len.json file not found. Max Prefix Len set to null"
        )
        max_prefix_len = None

    if realtime:
        # convert to predict only faster version
        model_query = convert_estimator(model_query)
        model_prefix = convert_estimator(model_prefix)

    return cls(
        model_query=model_query,
        model_prefix=model_prefix,
        delim=delim,
        max_prefix_len=max_prefix_len,
    )
def test_complement():
    """Compare a converted ComplementNB classifier with the original.

    Fitted on iris for the multiclass target and two binarised targets; every
    method named in ``METHODS`` must agree in output shape and values.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        model = ComplementNB()
        model.fit(features, target)
        converted = convert_estimator(model)
        for method in METHODS:
            expected = getattr(model, method)(features)
            actual = getattr(converted, method)(features_as_list)
            assert np.allclose(expected.shape, shape(actual))
            assert np.allclose(expected, actual, equal_nan=True)
def test_ridge():
    """Compare a converted RidgeClassifier with the original.

    Fitted on iris for the multiclass target and two binarised targets, with
    and without an intercept; every method named in ``METHODS`` must agree in
    output shape and values.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for fit_intercept in [True, False]:
            model = RidgeClassifier(fit_intercept=fit_intercept)
            model.fit(features, target)
            converted = convert_estimator(model)
            for method in METHODS:
                expected = getattr(model, method)(features)
                actual = getattr(converted, method)(features_as_list)
                assert np.allclose(expected.shape, shape(actual))
                assert np.allclose(expected, actual)
def convert_classifier(path, origin: str):
    """Convert a saved classifier to a pure-python estimator and save it.

    The converted object is written next to the input file as
    ``<name>_pure.sav``, where ``<name>`` is the input file name without its
    final extension.

    Parameters
    ----------
    path:
        Location of the saved classifier.
    origin: str
        Either ``"simba"`` or ``"bsoid"`` (case-insensitive); selects the
        loader and the output format.

    Raises
    ------
    ValueError
        If ``origin`` is neither ``"simba"`` nor ``"bsoid"``.
    """
    # convert to pure python estimator
    dir_path = os.path.dirname(path)
    # splitext copes with file names that contain no dot or several dots;
    # unpacking ``basename.split(".")`` into two names raised for those.
    filename, _ = os.path.splitext(os.path.basename(path))
    # os.path.join keeps a bare file name relative. Concatenating an empty
    # dir_path with "/" pointed the output at the filesystem root.
    out_path = os.path.join(dir_path, filename + "_pure.sav")
    origin_key = origin.lower()
    print("Loading classifier...")
    if origin_key == 'simba':
        clf = load_classifier_SIMBA(path)
        clf_pure_predict = convert_estimator(clf)
        with open(out_path, "wb") as f:
            pickle.dump(clf_pure_predict, f)
    elif origin_key == 'bsoid':
        clf_pack = load_classifier_BSOID(path)
        # bsoid exported classifier has format [a, b, c, clf, d, e]
        clf_pure_predict = convert_estimator(clf_pack[3])
        clf_pack[3] = clf_pure_predict
        with open(out_path, "wb") as f:
            joblib.dump(clf_pack, f)
    else:
        raise ValueError(f'{origin} is not a valid classifier origin.')
    print(f"Converted Classifier {filename}")
def test_extra_tree_clf():
    """Check that a converted ExtraTreeClassifier matches the original.

    Fitted on iris for the multiclass target and two binarised targets, for
    several ``max_depth`` values; every method named in ``METHODS`` must
    agree in output shape and values.
    """
    X, y = load_iris(return_X_y=True)
    X_ = X.tolist()
    for y_ in [y, (y == 0).astype(int), (y == 2).astype(int)]:
        for max_depth in [5, 10, None]:
            # Pass the loop's max_depth (it was previously ignored, so every
            # iteration fitted the same default tree) and fix the seed, as
            # test_extra_tree_reg does.
            clf = ExtraTreeClassifier(max_depth=max_depth, random_state=5)
            clf.fit(X, y_)
            clf_ = convert_estimator(clf)
            for method in METHODS:
                # Warnings raised while scoring are silenced.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    scores = getattr(clf, method)(X)
                    scores_ = getattr(clf_, method)(X_)
                    assert np.allclose(scores.shape, shape(scores_))
                    assert np.allclose(scores, scores_, equal_nan=True)
def test_extra_tree_reg():
    """Compare a converted ExtraTreeRegressor with the original.

    Fitted on iris against two binarised targets for several ``max_depth``
    values; ``predict`` must agree in output shape and values.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [(labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for max_depth in [5, 10, None]:
            regressor = ExtraTreeRegressor(max_depth=max_depth, random_state=5)
            regressor.fit(features, target)
            converted = convert_estimator(regressor)
            for method in ["predict"]:
                # Warnings raised while scoring are silenced.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    expected = getattr(regressor, method)(features)
                    actual = getattr(converted, method)(features_as_list)
                    assert np.allclose(expected.shape, shape(actual))
                    assert np.allclose(expected, actual, equal_nan=True)
def test_decision_tree_clf():
    """Compare a converted DecisionTreeClassifier with the original.

    Fitted on iris for the multiclass target and two binarised targets, for
    several ``max_depth`` values. For every method named in ``METHODS`` the
    converted estimator is scored on both the nested-list and the
    ``tosparse`` form of the data.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    features_sparse = tosparse(features_as_list)
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for max_depth in [5, 10, None]:
            tree = DecisionTreeClassifier(max_depth=max_depth, random_state=5)
            tree.fit(features, target)
            converted = convert_estimator(tree)
            for method in METHODS:
                # Warnings raised while scoring are silenced.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    expected = getattr(tree, method)(features)
                    from_list = getattr(converted, method)(features_as_list)
                    from_sparse = getattr(converted, method)(features_sparse)
                    assert np.allclose(expected, from_list, equal_nan=True)
                    assert np.allclose(expected, from_sparse, equal_nan=True)
def test_missing_indicator():
    """Compare a converted MissingIndicator with the original.

    Three ``missing_values`` markers are tried: NaN and two values taken from
    the iris data. For the NaN case, 20 randomly chosen cells of a fresh copy
    of the data are overwritten with NaN. Both ``features`` modes are checked
    for matching output shape and values.
    """
    data, _ = load_iris(return_X_y=True)
    markers = [np.nan, data[0][0], data[-1][1]]
    for missing_values in markers:
        # Reload so that NaNs written for one marker never leak into the next.
        data, _ = load_iris(return_X_y=True)
        if np.isnan(missing_values):
            holes = np.random.choice(data.size, 20, replace=False)
            data.ravel()[holes] = np.nan
        data_as_list = data.tolist()
        for features in ["missing-only", "all"]:
            indicator = MissingIndicator(
                features=features,
                missing_values=missing_values,
                error_on_new=False,
            )
            indicator.fit(data)
            converted = convert_estimator(indicator)
            expected = indicator.transform(data)
            actual = converted.transform(data_as_list)
            assert np.allclose(expected.shape, shape(actual))
            assert np.allclose(expected, actual)
def test_xgboost():
    """Compare a converted gbtree XGBClassifier with the original.

    Fitted on iris for the multiclass target and two binarised targets over a
    small grid of ``n_estimators`` and ``max_depth``; every method named in
    ``METHODS`` must agree in values.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for n_estimators in [2, 10]:
            for max_depth in [3, 10]:
                booster = XGBClassifier(
                    booster="gbtree",
                    random_state=5,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                )
                booster.fit(features, target)
                converted = convert_estimator(booster)
                for method in METHODS:
                    expected = getattr(booster, method)(features)
                    actual = getattr(converted, method)(features_as_list)
                    assert np.allclose(expected, actual, equal_nan=True)
def test_sgd():
    """Compare a converted SGDClassifier with the original.

    Fitted on iris for the multiclass target and two binarised targets, for
    every loss in ``LOSSES`` with and without an intercept. A method from
    ``METHODS`` is checked only when both estimators expose it.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for loss in LOSSES:
            for fit_intercept in [True, False]:
                model = SGDClassifier(
                    fit_intercept=fit_intercept,
                    max_iter=MAX_ITER,
                    tol=TOL,
                    loss=loss,
                )
                model.fit(features, target)
                converted = convert_estimator(model)
                for method in METHODS:
                    # Skip methods that either estimator does not expose.
                    if not (hasattr(model, method) and hasattr(converted, method)):
                        continue
                    # Warnings raised while scoring are silenced.
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        expected = getattr(model, method)(features)
                        actual = getattr(converted, method)(features_as_list)
                        assert np.allclose(expected.shape, shape(actual))
                        assert np.allclose(expected, actual, equal_nan=True)
def test_logistic():
    """Compare a converted LogisticRegression with the original.

    Fitted on iris for the multiclass target and two binarised targets, for
    the "ovr" and "multinomial" modes with and without an intercept. For
    every method named in ``METHODS`` the converted estimator is scored on
    both the nested-list and the ``tosparse`` form of the data.
    """
    features, labels = load_iris(return_X_y=True)
    features_as_list = features.tolist()
    features_sparse = tosparse(features_as_list)
    targets = [labels, (labels == 0).astype(int), (labels == 2).astype(int)]
    for target in targets:
        for multi_class in ["ovr", "multinomial"]:
            for fit_intercept in [True, False]:
                model = LogisticRegression(
                    solver=SOLVER,
                    multi_class=multi_class,
                    fit_intercept=fit_intercept,
                    max_iter=MAX_ITER,
                )
                model.fit(features, target)
                converted = convert_estimator(model)
                for method in METHODS:
                    expected = getattr(model, method)(features)
                    from_list = getattr(converted, method)(features_as_list)
                    from_sparse = getattr(converted, method)(features_sparse)
                    assert np.allclose(expected, from_list)
                    assert np.allclose(expected, from_sparse, equal_nan=True)