def test_equivalence_subsemble():
    """[SequentialEnsemble] A 'subsemble' layer reproduces a standalone Subsemble."""
    # Build both ensembles single-threaded before adding any layers.
    reference = Subsemble(n_jobs=1)
    sequential = SequentialEnsemble(n_jobs=1)

    # Same estimators and dtype on each side; only the wrapper class differs.
    reference.add(ECM, dtype=np.float64)
    sequential.add('subsemble', ECM, dtype=np.float64)

    # Fit and predict on the same data, reference ensemble first.
    fitted_reference = reference.fit(X, y)
    expected = fitted_reference.predict(X)
    fitted_sequential = sequential.fit(X, y)
    actual = fitted_sequential.predict(X)

    np.testing.assert_array_equal(actual, expected)
def test_equivalence_subsemble():
    """[Sequential] A 'subset' layer reproduces a standalone Subsemble."""
    # Both ensembles use their default construction arguments.
    reference = Subsemble()
    sequential = SequentialEnsemble()

    reference.add(ECM)
    sequential.add('subset', ECM)

    # Fit and predict on the same data, reference ensemble first.
    fitted_reference = reference.fit(X, y)
    expected = fitted_reference.predict(X)
    fitted_sequential = sequential.fit(X, y)
    actual = fitted_sequential.predict(X)

    np.testing.assert_array_equal(actual, expected)
def test_subset_equiv():
    """[Subsemble] A single-partition Subsemble matches SuperLearner output."""
    single_partition = Subsemble(partitions=1)
    super_learner = SuperLearner()

    # Identical layer specification on both ensembles.
    single_partition.add(ECM, dtype=np.float64)
    super_learner.add(ECM, dtype=np.float64)

    # Subsemble is fitted first, then the SuperLearner.
    fitted_subsemble = single_partition.fit(X, y)
    expected = fitted_subsemble.predict(X)
    fitted_super_learner = super_learner.fit(X, y)
    actual = fitted_super_learner.predict(X)

    np.testing.assert_array_equal(actual, expected)
def test_subset_fit():
    """[Subsemble] Ensemble predictions match a manually fitted OLS meta learner."""
    # Reference: an OLS fitted on the module-level F, then applied to P.
    reference_meta = OLS()
    reference_meta.fit(F, y)
    expected = reference_meta.predict(P)

    # Ensemble under test: one two-partition, three-fold layer plus an OLS meta layer.
    ensemble = Subsemble()
    ensemble.add(ECM, partitions=2, folds=3, dtype=np.float64)
    ensemble.add_meta(OLS(), dtype=np.float64)
    ensemble.fit(X, y)
    actual = ensemble.predict(X)

    np.testing.assert_array_equal(actual, expected)
def build_clustered_subsemble(estimator):
    """Build a two-partition subsemble that uses ``estimator`` as partition estimator.

    The ensemble has one layer holding an ``SVC`` and a ``LogisticRegression``,
    with an ``SVC`` as meta learner. The unfitted ensemble is returned.
    """
    settings = {
        "partitions": 2,
        "partition_estimator": estimator,
        "folds": 2,
        "verbose": 2,
    }
    ensemble = Subsemble(**settings)

    base_learners = [SVC(), LogisticRegression()]
    ensemble.add(base_learners)
    ensemble.add_meta(SVC())
    return ensemble
def add_subsemble(name, models, X_train, Y_train, X_test, Y_test):
    """Fit a Subsemble with an SVC meta learner and report accuracy and runtime.

    Parameters
    ----------
    name
        Label stored under the ``"Ensemble"`` key of the result.
    models
        Base learners handed to ``Subsemble.add``.
    X_train, Y_train
        Data the ensemble is fitted on.
    X_test, Y_test
        Data the ensemble predicts on and the labels it is scored against.

    Returns
    -------
    dict
        Keys ``"Ensemble"``, ``"Meta_Classifier"``, ``"Accuracy_Score"`` and
        ``"Runtime"`` (wall-clock seconds for fit, predict and scoring).
    """
    ensemble = Subsemble(scorer=accuracy_score, random_state=seed)
    ensemble.add(models)
    # Attach the final meta estimator
    ensemble.add(SVC(), meta=True)

    # The timed section covers fitting, prediction and scoring.
    start = time.time()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    # Ground truth goes first, following the (y_true, y_pred) convention.
    acc_score = accuracy_score(Y_test, preds)
    end = time.time()

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": end - start,
    }
'machine-learning-databases/' 'poker/poker-hand-testing.data') else: raise ValueError("Not valid data option.") X = np.loadtxt(out, delimiter=",") y = X[:, -1] X = X[:, :-1] return X, y

# Load the two data splits through get_data.
xtrain, ytrain = get_data('train')
xtest, ytest = get_data('test')

# One ensemble per architecture, each built with its default arguments.
estimators = {
    'subsemble': Subsemble(),
    'super_learner': SuperLearner(),
    'blend_ensemble': BlendEnsemble()
}

# NOTE(review): base_learners is not referenced by any statement visible here;
# the loop below adds freshly constructed, default-parameter estimators
# instead. Confirm whether the tuned learners were meant to be added.
base_learners = [
    RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=50, max_features=0.6),
    LogisticRegression(C=1e5),
    GradientBoostingClassifier()
]

# Give every ensemble the same three kinds of learner.
for clf in estimators.values():
    clf.add([RandomForestClassifier(), LogisticRegression(), MLPClassifier()])
def build_subsemble():
    """Return an unfitted three-partition, two-fold subsemble.

    The single layer holds an ``SVC`` and a ``LogisticRegression``; no meta
    learner is attached.
    """
    ensemble = Subsemble(partitions=3, folds=2)
    base_learners = [SVC(), LogisticRegression()]
    ensemble.add(base_learners)
    return ensemble
def __init__(self):
    """Nothing to configure: no attributes are set."""


def our_custom_function(self, X, y=None):
    """Label each row 0 or 1 by comparing its feature sum with the mean row sum."""
    row_totals = X.sum(axis=1)
    above_mean = row_totals > row_totals.mean()
    # Multiplying the boolean mask by 1 gives the numerical labels required
    return 1 * above_mean


def get_params(self, deep=False):
    """Return an empty parameter mapping; ``deep`` is accepted but unused."""
    return dict()


# Note that the number of partitions the estimator creates *must* match the
# ``partitions`` argument passed to the subsemble.

sub = Subsemble(partitions=2, folds=3, verbose=1)
layer_learners = [SVC(), LogisticRegression()]
sub.add(
    layer_learners,
    partition_estimator=SimplePartitioner(),
    fit_estimator=False,
    attr="our_custom_function",
)

sub.fit(X, y)

##############################################################################
# A final word of caution. When implementing custom estimators from scratch, some
# care needs to be taken if you plan on copying the Subsemble. It is advised that
# the estimator inherits the :class:`sklearn.base.BaseEstimator` class to
# provide a Scikit-learn compatible interface. For further information,
# see the :ref:`API` documentation of the :class:`Subsemble`
# and :class:`mlens.base.indexer.ClusteredSubsetIndex`.
#
# determine if we are building a classifier model classifier = np.all(np.unique(Y.to_numpy()) == [0, 1]) outputs = Y.shape[1] # separate the data into training and testing if TIME_SERIES: test_idx = X.index.values[-int(X.shape[0] / 5):] else: np.random.seed(1) test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False) train_idx = np.array(list(set(X.index.values) - set(test_idx))) # set up the model if classifier: model = Subsemble(partitions=2, random_state=42, n_jobs=1) model.add(KNeighborsClassifier()) model.add(RandomForestClassifier()) model.add(GaussianNB()) model.add_meta(LogisticRegression(penalty="l1", solver="saga")) else: model = Subsemble(partitions=2, random_state=42, n_jobs=1) model.add(KNeighborsRegressor()) model.add(RandomForestRegressor()) model.add(BayesianRidge()) model.add_meta(Lasso()) # train and predict train_predict = pd.DataFrame() test_predict = pd.DataFrame() for j in Y.columns: