def test_pipeline_add_at_start():
    """Piping a plain function in front of a pipeline should prepend it by name."""

    def a(x):
        pass

    base_pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()
    extended = a | base_pipeline
    assert str(extended) == 'a | StandardScaler | LinearRegression'
def test_one_many_consistent():
    """Checks that using fit_one or fit_many produces the same result."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    # Train one model row by row.
    row_model = lm.LinearRegression()
    for x, y in stream.iter_pandas(X, Y):
        row_model.fit_one(x, y)

    # Train another model on single-row "batches".
    batch_model = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, len(X)), np.array_split(Y, len(Y))):
        batch_model.fit_many(xb, yb)

    # Both training modes must land on the same weights.
    for col in X:
        assert math.isclose(row_model.weights[col], batch_model.weights[col])
def _default_params(cls):
    """Default constructor arguments: two scaled regressors for the ensemble."""
    regressors = [
        pp.StandardScaler() | lm.LinearRegression(intercept_lr=.1),
        pp.StandardScaler() | lm.PARegressor(),
    ]
    return {'regressors': regressors}
def test_model_upload(client, app, regression):
    """Uploading a model stores it on the shelf and makes it retrievable via the API."""
    # Instantiate a model and tag it so we can recognise this exact object later.
    model = linear_model.LinearRegression()
    probe = uuid.uuid4()
    model.probe = probe

    # Upload the model and check the response.
    r = client.post('/api/model/healthy-banana', data=pickle.dumps(model))
    assert r.status_code == 201
    assert r.json == {'name': 'healthy-banana'}

    # The model should now be on the shelf.
    with app.app_context():
        shelf = storage.get_db()
        stored = shelf['models/healthy-banana']
        assert isinstance(stored, linear_model.LinearRegression)
        assert stored.probe == probe

    # The model can be retrieved via the API by its name.
    by_name = pickle.loads(client.get('/api/model/healthy-banana').get_data())
    assert isinstance(by_name, linear_model.LinearRegression)
    assert by_name.probe == probe

    # The model can also be retrieved as the default model.
    default = pickle.loads(client.get('/api/model').get_data())
    assert isinstance(default, linear_model.LinearRegression)
    assert default.probe == probe
def main():
    """Benchmark creme and scikit-learn regressors on the Boston dataset."""
    sklearn_sgd = compat.CremeRegressorWrapper(
        sklearn_estimator=sk_linear_model.SGDRegressor(
            learning_rate='constant', eta0=0.01, fit_intercept=True, penalty='none'),
    )
    contenders = [
        ('creme', 'LinReg',
         linear_model.LinearRegression(optimizer=optim.VanillaSGD(0.01), l2=0.)),
        ('creme', 'GLM',
         linear_model.GLMRegressor(optimizer=optim.VanillaSGD(0.01), l2=0.)),
        ('creme', 'GLM detrend',
         meta.Detrender(linear_model.GLMRegressor(
             optimizer=optim.VanillaSGD(0.01), l2=0., intercept_lr=0.))),
        ('sklearn', 'SGD', sklearn_sgd),
    ]
    benchmark.benchmark(
        get_X_y=functools.partial(stream.iter_sklearn_dataset, datasets.load_boston()),
        n=506,
        get_pp=preprocessing.StandardScaler,
        models=contenders,
        get_metric=metrics.MSE)
def test_shuffle_columns():
    """Checks that fit_many works identically whether columns are shuffled or not."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    # Train one model with the columns in their natural order.
    in_order = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        in_order.fit_many(xb, yb)

    # Train another with a fresh column permutation for every batch.
    permuted = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        cols = np.random.permutation(X.columns)
        permuted.fit_many(xb[cols], yb)

    # Column order must not influence the learned weights.
    for col in X:
        assert math.isclose(in_order.weights[col], permuted.weights[col])
def test_models(client, app, regression):
    """Listing models returns them sorted, with the most recent upload as default."""
    payload = pickle.dumps(linear_model.LinearRegression())
    client.post('/api/model/ted-mosby', data=payload)
    client.post('/api/model/barney-stinson', data=payload)

    r = client.get('/api/models')
    assert r.json == {
        'default': 'barney-stinson',
        'models': ['barney-stinson', 'ted-mosby']
    }
def test_set_params():
    """_set_params must return a fresh, unfitted clone and leave the original intact."""
    original = linear_model.LinearRegression(l2=42)
    original.fit_one({'x': 3}, 6)

    clone = original._set_params({'l2': 21})

    # The clone carries the new hyperparameter; the original keeps its own.
    assert clone.l2 == 21
    assert original.l2 == 42
    # The clone starts from scratch: no learned weights.
    assert clone.weights == {}
    assert clone.weights != original.weights
def test_set_params_pipeline():
    """_set_params on a pipeline must clone it with the nested step's new params."""
    original = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    original.fit_one({'x': 3}, 6)

    clone = original._set_params({'LinearRegression': {'l2': 21}})

    # The nested step of the clone carries the new hyperparameter.
    assert clone['LinearRegression'].l2 == 21
    assert original['LinearRegression'].l2 == 42
    # The clone's regressor is unfitted while the original keeps its weights.
    assert clone['LinearRegression'].weights == {}
    assert clone['LinearRegression'].weights != original['LinearRegression'].weights
def test_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    model = lm.LinearRegression()
    n_picked = len(X.columns) // 2

    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        # Train on a different random half of the columns each round.
        picked = np.random.choice(X.columns, n_picked, replace=False)
        model.fit_many(xb[picked], yb)
def test_delete_model(client, app, regression):
    """Deleting a model through the API removes it from the shelf."""
    # Upload a model first.
    client.post('/api/model/healthy-banana',
                data=pickle.dumps(linear_model.LinearRegression()))
    with app.app_context():
        assert 'models/healthy-banana' in storage.get_db()

    # Deleting it must purge the shelf entry.
    client.delete('/api/model/healthy-banana')
    with app.app_context():
        assert 'models/healthy-banana' not in storage.get_db()
def __init__(self, p: int, d: int, q: int, m: int = 1, sp: int = 0, sd: int = 0,
             sq: int = 0, regressor: creme.base.Regressor = None):
    """(S)ARIMA-style state: orders (p, d, q), seasonal orders (sp, sd, sq), period m.

    Parameters:
        p: Order of the autoregressive part (number of past targets used).
        d: Ordinary differencing order.
        q: Order of the moving-average part (number of past errors used).
        m: Season length, in time steps.
        sp: Seasonal autoregressive order.
        sd: Seasonal differencing order.
        sq: Seasonal moving-average order.
        regressor: Underlying regressor; defaults to a standard-scaled
            linear regression.
    """
    self.p = p
    self.d = d
    self.q = q
    self.m = m
    self.sp = sp
    self.sd = sd
    self.sq = sq
    self.regressor = (
        regressor if regressor is not None else
        preprocessing.StandardScaler() | linear_model.LinearRegression()
    )
    # Ordinary differencing spans consecutive steps (m=1); seasonal
    # differencing must span a full season. Bug fix: the original passed
    # m=1 to the seasonal differencer, making it a no-op duplicate of the
    # ordinary one.
    self.differencer = Differencer(d=d, m=1) + Differencer(d=sd, m=m)
    # History buffers: enough past targets for the AR terms (p recent or
    # sp seasonal lags) ...
    self.y_trues = collections.deque(maxlen=max(p, m * sp))
    # ... and enough past errors for the MA terms. Bug fix: the original
    # sized this with the AR order p instead of the MA order q.
    self.errors = collections.deque(maxlen=max(q, m * sq))
def main():
    """Progressively validate a bike-count model with delayed (question/answer) labels."""
    import datetime as dt

    from creme import compose
    from creme import datasets
    from creme import feature_extraction
    from creme import linear_model
    from creme import metrics as metricss
    from creme import preprocessing
    from creme import stats
    from creme import stream

    X_y = datasets.Bikes()
    # Labels arrive 30 minutes after the features are observed.
    X_y = stream.simulate_qa(X_y, moment='moment', delay=dt.timedelta(minutes=30))

    def add_time_features(x):
        return {**x, 'hour': x['moment'].hour, 'day': x['moment'].weekday()}

    model = add_time_features
    model |= (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind') +
        feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean()) +
        feature_extraction.TargetAgg(by='station', how=stats.EWMean())
    )
    model |= preprocessing.StandardScaler()
    model |= linear_model.LinearRegression()

    metric = metricss.MAE()
    questions = {}

    for i, x, y in X_y:
        if y is None:
            # Question: predict now, grade later once the answer arrives.
            questions[i] = model.predict_one(x)
        else:
            # Answer: score the stored prediction and discard it. Bug fix:
            # the original never removed answered entries, so `questions`
            # grew without bound over the stream.
            metric.update(y, questions.pop(i))
            model = model.fit_one(x, y)

        if i >= 30000 and i % 30000 == 0:
            print(i, metric)
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and creme implementations produce the same results."""

    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""

        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    scaler = preprocessing.StandardScaler()
    creme_model = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sk_model = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for x, y in datasets.TrumpApproval():
        x = scaler.fit_one(x).transform_one(x)
        creme_model.fit_one(x, y)
        sk_model.partial_fit([list(x.values())], [y])

    # Weights and intercept must coincide between the two implementations.
    for creme_weight, sk_weight in zip(creme_model.weights.values(), sk_model.coef_):
        assert math.isclose(creme_weight, sk_weight)
    assert math.isclose(creme_model.intercept, sk_model.intercept_[0])
def main():
    """Benchmark creme and scikit-learn regressors on the bike-sharing dataset."""

    def add_hour(x):
        x['hour'] = x['moment'].hour
        return x

    def make_pp():
        # Whitelist weather features, add a per-station/hour target mean, scale.
        features = compose.Whitelister(
            'clouds', 'humidity', 'pressure', 'temperature', 'wind')
        agg = add_hour | feature_extraction.TargetAgg(
            by=['station', 'hour'], how=stats.Mean())
        return (features + agg) | preprocessing.StandardScaler()

    def make_sklearn_sgd(fit_intercept):
        return compat.CremeRegressorWrapper(
            sklearn_estimator=sk_linear_model.SGDRegressor(
                learning_rate='constant', eta0=0.01,
                fit_intercept=fit_intercept, penalty='none'),
        )

    contenders = [
        ('creme', 'LinReg',
         linear_model.LinearRegression(optimizer=optim.VanillaSGD(0.01), l2=0.)),
        ('creme', 'GLM',
         linear_model.GLMRegressor(optimizer=optim.VanillaSGD(0.01),
                                   intercept_lr=0.01, l2=0)),
        ('sklearn', 'SGD', make_sklearn_sgd(fit_intercept=True)),
        ('creme', 'GLM no intercept',
         linear_model.GLMRegressor(optimizer=optim.VanillaSGD(0.01),
                                   intercept_lr=0, l2=0)),
        ('sklearn', 'SGD no intercept', make_sklearn_sgd(fit_intercept=False)),
    ]

    benchmark.benchmark(
        get_X_y=datasets.fetch_bikes,
        n=182470,
        get_pp=make_pp,
        models=contenders,
        get_metric=metrics.MSE)
def test_add_model(app):
    """The CLI add_model command should shelve a pickled model under the given name."""
    runner = app.test_cli_runner()

    # Pickle a model, tagging it with a probe so we can recognise this exact object.
    model = linear_model.LinearRegression()
    probe = uuid.uuid4()
    model.probe = probe
    with open('tmp.pkl', 'wb') as f:
        pickle.dump(model, f)

    try:
        # Add the model to the shelf through the CLI.
        result = runner.invoke(cli.add_model, ['tmp.pkl', '--name', 'banana'])
        assert result.exit_code == 0

        # Check that the model has been added to the shelf.
        with app.app_context():
            db = storage.get_db()
            assert isinstance(db['models/banana'], linear_model.LinearRegression)
            assert db['models/banana'].probe == probe
    finally:
        # Bug fix: delete the pickle even when an assertion fails — the
        # original leaked tmp.pkl on failure, polluting later test runs.
        os.remove('tmp.pkl')
def test_model_no_flavor(client, app):
    """Uploading a model before a flavor is set must be rejected with a 400."""
    payload = pickle.dumps(linear_model.LinearRegression())
    response = client.post('/api/model', data=payload)
    assert response.status_code == 400
    assert response.json == {'message': 'No flavor has been set.'}
def lin_reg(client):
    """Fixture helper: upload a scaled linear regression under the name 'lin-reg'."""
    pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()
    client.post('/api/model/lin-reg', data=pickle.dumps(pipeline))
import pytest
from sklearn.utils import estimator_checks
from sklearn import linear_model as sk_linear_model

from creme import base
from creme import cluster
from creme import compat
from creme import linear_model
from creme import preprocessing


# Estimators that must survive conversion to the sklearn API.
ESTIMATORS = [
    linear_model.LinearRegression(),
    linear_model.LogisticRegression(),
    preprocessing.StandardScaler(),
    cluster.KMeans(seed=42)
]


@pytest.mark.parametrize(
    'estimator',
    [pytest.param(estimator, id=str(estimator)) for estimator in ESTIMATORS]
)
def test_creme_to_sklearn_check_estimator(estimator: base.Estimator):
    """Every converted creme estimator must pass sklearn's estimator checks."""
    skl_estimator = compat.convert_creme_to_sklearn(estimator)
    estimator_checks.check_estimator(skl_estimator)


def test_sklearn_check_twoway():
    """Round-tripping sklearn -> creme -> sklearn must keep the estimator valid."""
    estimator = sk_linear_model.SGDRegressor()
    creme_estimator = compat.convert_sklearn_to_creme(estimator)
    skl_estimator = compat.convert_creme_to_sklearn(creme_estimator)
    estimator_checks.check_estimator(skl_estimator)
def _default_params(cls):
    """Default constructor arguments used by the unit tests."""
    return {'model': linear_model.LinearRegression()}
def get_all_estimators():
    """Yield one ready-to-use instance of every testable creme estimator.

    Walks every public submodule of `creme`, instantiating each concrete
    estimator class it finds. Classes in `ignored` are skipped (wrappers,
    transformers without a default constructor, etc.); classes that need
    constructor arguments get sensible defaults below.
    """
    # Classes that cannot be instantiated generically and are excluded.
    ignored = (CremeBaseWrapper, SKLBaseWrapper, base.Wrapper,
               compose.FuncTransformer, ensemble.StackingBinaryClassifier,
               feature_extraction.Agg, feature_extraction.TargetAgg,
               feature_extraction.Differ, linear_model.FMRegressor,
               linear_model.SoftmaxRegression, multioutput.ClassifierChain,
               multioutput.RegressorChain, naive_bayes.BernoulliNB,
               naive_bayes.ComplementNB, preprocessing.OneHotEncoder,
               tree.DecisionTreeClassifier)

    def is_estimator(obj):
        # Only concrete classes deriving from the creme Estimator base count.
        return inspect.isclass(obj) and issubclass(obj, base.Estimator)

    for submodule in importlib.import_module('creme').__all__:
        if submodule == 'base':
            continue
        for name, obj in inspect.getmembers(
                importlib.import_module(f'creme.{submodule}'), is_estimator):
            if issubclass(obj, ignored):
                continue
            # NOTE: branch order matters — subclasses must be matched before
            # their parents in this dispatch chain.
            if issubclass(obj, dummy.StatisticRegressor):
                inst = obj(statistic=stats.Mean())
            elif issubclass(obj, ensemble.BaggingClassifier):
                inst = obj(linear_model.LogisticRegression())
            elif issubclass(obj, ensemble.BaggingRegressor):
                inst = obj(linear_model.LinearRegression())
            elif issubclass(obj, ensemble.HedgeRegressor):
                inst = obj([
                    preprocessing.StandardScaler() | linear_model.LinearRegression(intercept_lr=0.1),
                    preprocessing.StandardScaler() | linear_model.PARegressor(),
                ])
            elif issubclass(obj, feature_selection.RandomDiscarder):
                inst = obj(n_to_keep=5)
            elif issubclass(obj, feature_selection.SelectKBest):
                inst = obj(similarity=stats.PearsonCorrelation())
            elif issubclass(obj, linear_model.LinearRegression):
                # Linear models are wrapped in a scaler so they behave on raw data.
                inst = preprocessing.StandardScaler() | obj(intercept_lr=0.1)
            elif issubclass(obj, linear_model.PARegressor):
                inst = preprocessing.StandardScaler() | obj()
            elif issubclass(obj, multiclass.OneVsRestClassifier):
                inst = obj(binary_classifier=linear_model.LogisticRegression())
            else:
                inst = obj()
            yield inst
elif (opt == "Adam"): optimizer = optim.Adam(lr, beta_1, beta_2, eps) elif (opt == "FTRLProximal"): optimizer = optim.FTRLProximal(alpha, beta, l1, l2) elif (opt == "Momentum"): optimizer = optim.Momentum(lr, rho) elif (opt == "RMSProp"): optimizer = optim.RMSProp(lr, rho, eps) elif (opt == "VanillaSGD"): optimizer = optim.VanillaSGD(lr) elif (opt == "NesterovMomentum"): optimizer = optim.NesterovMomentum(lr, rho) else: optimizer = None lin_reg = linear_model.LinearRegression(optimizer, l2= l2) output = {} while True: #wait request data = input() Xi = json.loads(data) y = float(Xi.pop(target)) output["Predict"] = lin_reg.predict_one(Xi) output["Truth"] = y model = lin_reg.fit_one(Xi, y)
def get_all_estimators():
    """Yield one ready-to-use instance of every testable creme estimator.

    Walks every public submodule of `creme`, instantiating each concrete
    estimator class it finds. Classes in `ignored` are skipped (wrappers,
    pipelines, recommenders, etc.); classes that need constructor arguments
    get sensible defaults below.
    """
    # Classes that cannot be instantiated generically and are excluded.
    ignored = (Creme2SKLBase, SKL2CremeBase, compat.PyTorch2CremeRegressor,
               compose.FuncTransformer, compose.Pipeline,
               ensemble.StackingBinaryClassifier, feature_extraction.Agg,
               feature_extraction.TargetAgg, feature_extraction.Differ,
               feature_selection.PoissonInclusion, imblearn.RandomOverSampler,
               imblearn.RandomUnderSampler, imblearn.RandomSampler,
               impute.PreviousImputer, impute.StatImputer,
               linear_model.FFMClassifier, linear_model.FFMRegressor,
               linear_model.FMClassifier, linear_model.FMRegressor,
               linear_model.HOFMClassifier, linear_model.HOFMRegressor,
               linear_model.SoftmaxRegression, meta.PredClipper,
               meta.TransformedTargetRegressor, multioutput.ClassifierChain,
               multioutput.RegressorChain, preprocessing.OneHotEncoder,
               reco.Baseline, reco.BiasedMF, reco.FunkMF, reco.RandomNormal,
               time_series.Detrender, time_series.GroupDetrender,
               time_series.SNARIMAX)

    def is_estimator(obj):
        # Only concrete classes deriving from the creme Estimator base count.
        return inspect.isclass(obj) and issubclass(obj, base.Estimator)

    for submodule in importlib.import_module('creme').__all__:
        if submodule == 'base':
            continue
        for _, obj in inspect.getmembers(
                importlib.import_module(f'creme.{submodule}'), is_estimator):
            # NOTE: branch order matters — subclasses must be matched before
            # their parents in this dispatch chain.
            if issubclass(obj, ignored):
                continue
            elif issubclass(obj, dummy.StatisticRegressor):
                inst = obj(statistic=stats.Mean())
            elif issubclass(obj, meta.BoxCoxRegressor):
                inst = obj(regressor=linear_model.LinearRegression())
            elif issubclass(obj, tree.RandomForestClassifier):
                inst = obj()
            elif issubclass(obj, ensemble.BaggingClassifier):
                inst = obj(linear_model.LogisticRegression())
            elif issubclass(obj, ensemble.BaggingRegressor):
                inst = obj(linear_model.LinearRegression())
            elif issubclass(obj, ensemble.AdaBoostClassifier):
                inst = obj(linear_model.LogisticRegression())
            elif issubclass(obj, ensemble.HedgeRegressor):
                inst = obj([
                    preprocessing.StandardScaler() | linear_model.LinearRegression(intercept_lr=.1),
                    preprocessing.StandardScaler() | linear_model.PARegressor(),
                ])
            elif issubclass(obj, feature_selection.SelectKBest):
                inst = obj(similarity=stats.PearsonCorrelation())
            elif issubclass(obj, linear_model.LinearRegression):
                # Linear models are wrapped in a scaler so they behave on raw data.
                inst = preprocessing.StandardScaler() | obj(intercept_lr=.1)
            elif issubclass(obj, linear_model.PARegressor):
                inst = preprocessing.StandardScaler() | obj()
            elif issubclass(obj, multiclass.OneVsRestClassifier):
                inst = obj(binary_classifier=linear_model.LogisticRegression())
            else:
                inst = obj()
            yield inst
'total_mastery_points_ratio': total_points_ratio, 'rank_ratio': rank_ratio } MODELS = { 'v0': (compose.FuncTransformer(process_match) | compose.TransformerUnion([ compose.Whitelister( 'champion_mastery_points_ratio', 'total_mastery_points_ratio', 'rank_ratio', ), preprocessing.OneHotEncoder('mode', sparse=False), preprocessing.OneHotEncoder('type', sparse=False) ]) | preprocessing.StandardScaler() | linear_model.LinearRegression(optim.VanillaSGD(0.005))) } class Command(base.BaseCommand): def handle(self, *args, **options): print(f'Adding models with creme version {creme.__version__}') for name, pipeline in MODELS.items(): if models.CremeModel.objects.filter(name=name).exists(): print(f'\t{name} has already been added') continue models.CremeModel(name=name, pipeline=pipeline).save()
def test_check_estimator():
    """A converted creme LinearRegression must pass sklearn's estimator checks."""
    creme_model = linear_model.LinearRegression()
    estimator_checks.check_estimator(compat.convert_creme_to_sklearn(creme_model))
def __init__(self, data_collector):
    """Train four identical hedge-ensemble pipelines, one per target sensor.

    Parameters:
        data_collector: object exposing ``get_data_frame()`` with columns
            x, y, theta and sensor_1/3/5/7 readings.
    """
    # Drop incomplete rows, then shuffle the examples.
    frame = data_collector.get_data_frame().dropna()
    X_y = frame.sample(frac=1).reset_index(drop=True)

    features = X_y[['x', 'y', 'theta']].to_dict('records')
    targets = [X_y[[col]] for col in ('sensor_1', 'sensor_3', 'sensor_5', 'sensor_7')]

    print('constructing models')

    def build_model():
        # One scaled hedge ensemble over three optimizer variants.
        return Pipeline([
            ("scale", StandardScaler()),
            ("learn", ensemble.HedgeRegressor([
                linear_model.LinearRegression(optim.SGD()),
                linear_model.LinearRegression(optim.RMSProp()),
                linear_model.LinearRegression(optim.Adam())
            ]))
        ])

    # Improvement: the original duplicated this pipeline literal four times
    # and repeated the per-model update lines by hand; a loop keeps them in sync.
    trained = [build_model() for _ in range(len(targets))]

    print('start training')
    for x, *ys in zip(features, *(t.values for t in targets)):
        for idx, y in enumerate(ys):
            trained[idx], _ = self._update_model(trained[idx], x, y)

    self.models = trained
    print('done...')
elif issubclass(obj, multiclass.OneVsRestClassifier): inst = obj(binary_classifier=linear_model.LogisticRegression()) else: inst = obj() yield inst @pytest.mark.parametrize('estimator, check', [ pytest.param( copy.deepcopy(estimator), check, id=f'{estimator}:{check.__name__}') for estimator in list(get_all_estimators()) + [ feature_extraction.TFIDF(), linear_model.LogisticRegression(), preprocessing.StandardScaler() | linear_model.LinearRegression(), preprocessing.StandardScaler() | linear_model.PAClassifier(), preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.LogisticRegression()), preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.PAClassifier()), naive_bayes.GaussianNB(), preprocessing.StandardScaler(), cluster.KMeans(n_clusters=5, seed=42), preprocessing.MinMaxScaler(), preprocessing.MinMaxScaler() + preprocessing.StandardScaler(), preprocessing.PolynomialExtender(), feature_selection.VarianceThreshold(), feature_selection.SelectKBest(similarity=stats.PearsonCorrelation()) ] for check in utils.estimator_checks.yield_checks(estimator) ])
import pytest from creme import compose from creme import linear_model from creme import optim from creme import preprocessing from creme import tree from creme import utils @pytest.mark.parametrize('model, param_grid, count', [ (linear_model.LinearRegression(), { 'optimizer': [(optim.SGD, { 'lr': [1, 2] }), (optim.Adam, { 'beta_1': [.1, .01, .001], 'lr': [.1, .01, .001, .0001] })] }, 2 + 3 * 4), (preprocessing.StandardScaler() | linear_model.LinearRegression(), { 'LinearRegression': { 'optimizer': [(optim.SGD, { 'lr': [1, 2] }), (optim.Adam, { 'beta_1': [.1, .01, .001], 'lr': [.1, .01, .001, .0001] })] } }, 2 + 3 * 4),
} def add_intercept(x): return {**x, 'intercept': 1.} for name, (creme_optim, torch_optim, keras_optim) in OPTIMIZERS.items(): X_y = stream.iter_sklearn_dataset(dataset=datasets.load_boston(), shuffle=True, random_state=42) n_features = 13 creme_lin_reg = (compose.FuncTransformer(add_intercept) | linear_model.LinearRegression( optimizer=creme_optim, l2=0, intercept_lr=0)) torch_model = PyTorchNet(n_features=n_features) torch_lin_reg = PyTorchRegressor(network=torch_model, loss_fn=torch.nn.MSELoss(), optimizer=torch_optim( torch_model.parameters())) inputs = layers.Input(shape=(n_features, )) predictions = layers.Dense(1, kernel_initializer='zeros', bias_initializer='zeros')(inputs) keras_model = models.Model(inputs=inputs, outputs=predictions) keras_model.compile(optimizer=keras_optim, loss='mean_squared_error') keras_lin_reg = KerasRegressor(keras_model)
def test_linear_regression():
    """A wrapped sklearn LinearRegression must pass sklearn's estimator checks."""
    wrapped = compat.SKLRegressorWrapper(linear_model.LinearRegression())
    estimator_checks.check_estimator(wrapped)