def _unit_test_params(cls):
    """Yield parameter sets exercised by the generic unit tests."""
    # First set: two hand-built pipelines with different SGD learning rates.
    slow = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LinearRegression(optimizer=optim.SGD(lr=1e-2)),
    )
    fast = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LinearRegression(optimizer=optim.SGD(lr=1e-1)),
    )
    yield {"models": [slow, fast], "metric": metrics.MAE()}

    # Second set: a programmatic sweep over several learning rates.
    yield {
        "models": [
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)),
            )
            for lr in [1e-4, 1e-3, 1e-2, 1e-1]
        ],
        "metric": metrics.MAE(),
    }
def __init__(
    self,
    p: int,
    d: int,
    q: int,
    m: int = 1,
    sp: int = 0,
    sd: int = 0,
    sq: int = 0,
    regressor: base.Regressor = None,
):
    """SNARIMAX-style forecaster.

    Parameters
    ----------
    p
        Order of the autoregressive part (number of past targets used).
    d
        Differencing order.
    q
        Order of the moving average part (number of past errors used).
    m
        Season length (period); 1 means no seasonality.
    sp, sd, sq
        Seasonal counterparts of p, d and q.
    regressor
        Regressor fitted on the lagged features; defaults to a scaled
        linear regression.
    """
    self.p = p
    self.d = d
    self.q = q
    self.m = m
    self.sp = sp
    self.sd = sd
    self.sq = sq
    self.regressor = (
        regressor
        if regressor is not None
        else preprocessing.StandardScaler() | linear_model.LinearRegression()
    )
    # Chain the ordinary differencer with the seasonal one.
    # BUG FIX: the seasonal differencer must use the season length `m`;
    # with m=1 the seasonal differencing order `sd` had no seasonal effect.
    self.differencer = Differencer(d=d, m=1) + Differencer(d=sd, m=m)
    # Keep just enough target history for the AR terms (p regular, sp seasonal).
    self.y_trues = collections.deque(maxlen=max(p, m * sp))
    # BUG FIX: the error buffer backs the MA terms, so its length depends on
    # `q` (not `p`) alongside the seasonal term m * sq.
    self.errors = collections.deque(maxlen=max(q, m * sq))
def _default_params(cls):
    """Return the default construction parameters used by tests."""
    regressors = [
        pp.StandardScaler() | lm.LinearRegression(intercept_lr=0.1),
        pp.StandardScaler() | lm.PARegressor(),
    ]
    return {'regressors': regressors}
def _unit_test_params(cls):
    """Return model parameters for the generic unit tests."""
    candidates = [
        pp.StandardScaler() | lm.LinearRegression(intercept_lr=0.1),
        pp.StandardScaler() | lm.PARegressor(),
    ]
    return {"models": candidates}
def test_pipeline_add_at_start():
    """A callable piped in front of a pipeline becomes its first step."""

    def a(x):
        pass

    base_pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()
    pipeline = a | base_pipeline
    # The function's name should show up as the leading step.
    assert str(pipeline) == "a | StandardScaler | LinearRegression"
def test_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    # Train one model sample by sample.
    single = lm.LinearRegression()
    for x, y in stream.iter_pandas(X, Y):
        single.learn_one(x, y)

    # Train another model on one-row mini-batches.
    batched = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, len(X)), np.array_split(Y, len(Y))):
        batched.learn_many(xb, yb)

    # Both training regimes must converge to identical weights.
    for col in X:
        assert math.isclose(single.weights[col], batched.weights[col])
def _unit_test_params(cls):
    """Return model/metric parameters for the generic unit tests."""

    def make_pipeline(lr):
        # One scaled linear regression per learning rate.
        return compose.Pipeline(
            preprocessing.StandardScaler(),
            linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)),
        )

    return {
        "models": [make_pipeline(0.01), make_pipeline(0.1)],
        "metric": metrics.MAE(),
    }
def __init__(
    self,
    grace_period: int = 200,
    max_depth: int = None,
    split_confidence: float = 1e-7,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    attr_obs: str = "e-bst",
    attr_obs_params: dict = None,
    min_samples_split: int = 5,
    **kwargs,
):
    """Store the regression-tree hyper-parameters and validate `attr_obs`."""
    super().__init__(max_depth=max_depth, **kwargs)

    # Regression trees always split on variance reduction.
    self._split_criterion: str = "vr"

    self.grace_period = grace_period
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.leaf_prediction = leaf_prediction

    # Fall back to a plain linear regression at the leaves.
    if leaf_model:
        self.leaf_model = leaf_model
    else:
        self.leaf_model = linear_model.LinearRegression()

    self.model_selector_decay = model_selector_decay
    self.nominal_attributes = nominal_attributes
    self.min_samples_split = min_samples_split

    if attr_obs not in self._VALID_AO:
        raise AttributeError(
            f'Invalid "attr_obs" option. Valid options are: {self._VALID_AO}'
        )
    self.attr_obs = attr_obs

    if attr_obs_params is not None:
        self.attr_obs_params = attr_obs_params
    else:
        self.attr_obs_params = {}

    self.kwargs = kwargs
def get_model():
    """Build the forecasting pipeline: date features feeding a SNARIMAX model."""
    features = compose.TransformerUnion(get_ordinal_date, get_day_distances)
    regressor = preprocessing.StandardScaler() | linear_model.LinearRegression(
        intercept_init=0,
        intercept_lr=0.3,
        optimizer=optim.SGD(0.01),
    )
    forecaster = time_series.SNARIMAX(
        p=0, d=0, q=0, m=7, sp=3, sq=0, regressor=regressor
    )
    return features | forecaster
def test_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    normal = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        normal.learn_many(xb, yb)

    shuffled = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        # Present the same rows, but with the columns in a random order.
        permuted = np.random.permutation(X.columns)
        shuffled.learn_many(xb[permuted], yb)

    for col in X:
        assert math.isclose(normal.weights[col], shuffled.weights[col])
def test_set_params():
    """_set_params returns a fresh, unfitted copy carrying the new parameters."""
    original = linear_model.LinearRegression(l2=42)
    original.learn_one({'x': 3}, 6)

    clone = original._set_params({'l2': 21})

    # The clone picks up the new setting; the original keeps its own.
    assert clone.l2 == 21
    assert original.l2 == 42
    # The clone starts from scratch: no learned weights.
    assert clone.weights == {}
    assert clone.weights != original.weights
def test_set_params_pipeline():
    """_set_params on a pipeline clones the nested model with new parameters."""
    obj = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    obj.learn_one({'x': 3}, 6)

    new = obj._set_params({'LinearRegression': {'l2': 21}})

    lin_new = new['LinearRegression']
    lin_old = obj['LinearRegression']
    assert lin_new.l2 == 21
    assert lin_old.l2 == 42
    # The nested clone is unfitted while the original keeps its weights.
    assert lin_new.weights == {}
    assert lin_new.weights != lin_old.weights
def _unit_test_params(cls):
    """Yield parameters for the generic unit tests."""
    import math

    from river import linear_model

    # Unbounded clipping interval: the wrapper becomes a pass-through.
    params = {
        "regressor": linear_model.LinearRegression(),
        "y_min": -math.inf,
        "y_max": math.inf,
    }
    yield params
def test_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    model = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        # Pick half of the columns at random for this batch.
        kept = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        model.learn_many(xb[kept], yb)
def test_set_params_pipeline():
    """A pipeline's _set_params yields an unfitted clone with updated params."""
    pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    pipeline.learn_one({"x": 3}, 6)

    updated = pipeline._set_params({"LinearRegression": {"l2": 21}})

    assert updated["LinearRegression"].l2 == 21
    assert pipeline["LinearRegression"].l2 == 42
    assert updated["LinearRegression"].weights == {}
    assert updated["LinearRegression"].weights != pipeline["LinearRegression"].weights
def __init__(
    self,
    grace_period: int = 200,
    max_depth: int = None,
    split_confidence: float = 1e-7,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    attr_obs: str = "e-bst",
    attr_obs_params: dict = None,
    min_samples_split: int = 5,
    **kwargs,
):
    """Store the regression-tree hyper-parameters and configure the
    attribute observer.

    Raises
    ------
    AttributeError
        If `attr_obs` is not one of `self._VALID_AO`.
    """
    super().__init__(max_depth=max_depth, **kwargs)
    # Regression trees split on variance reduction ("vr").
    self._split_criterion: str = "vr"
    self.grace_period = grace_period
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.leaf_prediction = leaf_prediction
    # Default leaf predictor is a plain linear regression.
    self.leaf_model = leaf_model if leaf_model else linear_model.LinearRegression()
    self.model_selector_decay = model_selector_decay
    self.nominal_attributes = nominal_attributes
    self.min_samples_split = min_samples_split
    if attr_obs not in self._VALID_AO:
        raise AttributeError(
            f'Invalid "attr_obs" option. Valid options are: {self._VALID_AO}'
        )
    self.attr_obs = attr_obs
    self.attr_obs_params = attr_obs_params if attr_obs_params is not None else {}
    self.kwargs = kwargs

    # Extra setup when the quantile-observer splitter is selected
    # (self._QO is declared on the class — presumably one of _VALID_AO).
    if self.attr_obs == self._QO:
        self._qo_std_div = 3  # default std-div factor when none is supplied
        if "std_div" in self.attr_obs_params:
            # Make sure the passed std_div value is valid
            # (None is accepted and disables the dynamic radii below).
            if (
                self.attr_obs_params["std_div"] is None
                or self.attr_obs_params["std_div"] > 0
            ):
                self._qo_std_div = self.attr_obs_params["std_div"]
        if self._qo_std_div:
            # Dynamically evolving radii will be used
            self._feat_var = {}
            self._qo_radii = defaultdict(dict)
        else:
            # Static values will be used
            if "radius" in self.attr_obs_params:
                self._qo_radii = {"radius": self.attr_obs_params["radius"]}
            else:
                self._qo_radii = {}
def test_lin_reg_sklearn_coherence(river_params, sklearn_params):
    """Checks that the sklearn and river implementations produce the same results."""
    scaler = preprocessing.StandardScaler()
    river_model = lm.LinearRegression(**river_params)
    sklearn_model = sklm.SGDRegressor(**sklearn_params)

    for x, y in datasets.TrumpApproval().take(100):
        x = scaler.learn_one(x).transform_one(x)
        river_model.learn_one(x, y)
        sklearn_model.partial_fit([list(x.values())], [y])

    # Weights and intercept must coincide between the two libraries.
    for i, w in enumerate(river_model.weights.values()):
        assert math.isclose(w, sklearn_model.coef_[i])
    assert math.isclose(river_model.intercept, sklearn_model.intercept_[0])
def __init__(
    self,
    grace_period: int = 200,
    max_depth: int = None,
    split_confidence: float = 1e-7,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    splitter: Splitter = None,
    min_samples_split: int = 5,
    binary_split: bool = False,
    max_size: int = 500,
    memory_estimate_period: int = 1000000,
    stop_mem_management: bool = False,
    remove_poor_attrs: bool = False,
    merit_preprune: bool = True,
):
    """Store the regression-tree hyper-parameters and validate the splitter."""
    super().__init__(
        max_depth=max_depth,
        binary_split=binary_split,
        max_size=max_size,
        memory_estimate_period=memory_estimate_period,
        stop_mem_management=stop_mem_management,
        remove_poor_attrs=remove_poor_attrs,
        merit_preprune=merit_preprune,
    )

    # Regression trees split on variance reduction.
    self._split_criterion: str = "vr"

    self.grace_period = grace_period
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.leaf_prediction = leaf_prediction
    # Default leaf predictor is a plain linear regression.
    self.leaf_model = leaf_model or linear_model.LinearRegression()
    self.model_selector_decay = model_selector_decay
    self.nominal_attributes = nominal_attributes
    self.min_samples_split = min_samples_split

    if splitter is not None:
        # Only regression-capable splitters are accepted.
        if splitter.is_target_class:
            raise ValueError(
                "The chosen splitter cannot be used in regression tasks.")
        self.splitter = splitter
    else:
        self.splitter = EBSTSplitter()
def __init__(
    self,
    n_min: int = 200,
    delta: float = 1e-7,
    tau: float = 0.05,
    pred_type: str = "adaptive",
    pred_model: base.Regressor = None,
    splitter: Splitter = None,
    drift_detector: base.DriftDetector = None,
    alpha: float = 0.99,
    anomaly_threshold: float = -0.75,
    m_min: int = 30,
    ordered_rule_set: bool = True,
    min_samples_split: int = 5,
):
    """Store the rule-induction hyper-parameters and initialise empty rule state."""
    self.n_min = n_min
    self.delta = delta
    self.tau = tau

    # Reject unknown prediction strategies up front.
    if pred_type not in self._VALID_PRED:
        raise ValueError(f"Invalid 'pred_type': {pred_type}")
    self.pred_type = pred_type

    # Defaults: one linear model per rule, EBST splitter, Page-Hinkley detector.
    self.pred_model = pred_model or linear_model.LinearRegression()
    self.splitter = tree.splitter.EBSTSplitter() if splitter is None else splitter
    if drift_detector is None:
        self.drift_detector = drift.PageHinkley()
    else:
        self.drift_detector = drift_detector

    self.alpha = alpha
    self.anomaly_threshold = anomaly_threshold
    self.m_min = m_min
    self.ordered_rule_set = ordered_rule_set
    self.min_samples_split = min_samples_split

    # Mutable learning state.
    self._default_rule = self._new_rule()
    self._rules: typing.Dict[typing.Hashable, RegRule] = {}
    self._n_drifts_detected: int = 0
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""

        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    scaler = preprocessing.StandardScaler()
    river_model = lm.LinearRegression(optimizer=optim.SGD(0.01), loss=SquaredLoss())
    sk_model = sklm.SGDRegressor(learning_rate='constant', eta0=0.01, alpha=0.0)

    for x, y in datasets.TrumpApproval():
        x = scaler.learn_one(x).transform_one(x)
        river_model.learn_one(x, y)
        sk_model.partial_fit([list(x.values())], [y])

    # Weights and intercept must coincide between the two libraries.
    for i, w in enumerate(river_model.weights.values()):
        assert math.isclose(w, sk_model.coef_[i])
    assert math.isclose(river_model.intercept, sk_model.intercept_[0])
def __init__(
    self,
    grace_period: int = 200,
    max_depth: int = None,
    split_confidence: float = 1e-7,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    splitter: Splitter = None,
    min_samples_split: int = 5,
    **kwargs,
):
    """Store the regression-tree hyper-parameters, validating the splitter."""
    super().__init__(max_depth=max_depth, **kwargs)

    # Regression trees split on variance reduction.
    self._split_criterion: str = "vr"

    self.grace_period = grace_period
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.leaf_prediction = leaf_prediction
    # Default leaf predictor is a plain linear regression.
    self.leaf_model = leaf_model or linear_model.LinearRegression()
    self.model_selector_decay = model_selector_decay
    self.nominal_attributes = nominal_attributes
    self.min_samples_split = min_samples_split

    if splitter is not None:
        # Only regression-capable splitters are accepted.
        if splitter.is_target_class:
            raise ValueError(
                "The chosen splitter cannot be used in regression tasks."
            )
        self.splitter = splitter
    else:
        self.splitter = EBSTSplitter()

    self.kwargs = kwargs
def _unit_test_params(cls):
    """Return the construction parameters used by the generic unit tests."""
    regressor = linear_model.LinearRegression()
    return {"regressor": regressor, "p": 0.1, "size": 40}
import pytest from river import compose, linear_model, optim, preprocessing, tree, utils @pytest.mark.parametrize( "model, param_grid, count", [ ( linear_model.LinearRegression(), { "optimizer": [ (optim.SGD, {"lr": [1, 2]}), ( optim.Adam, { "beta_1": [0.1, 0.01, 0.001], "lr": [0.1, 0.01, 0.001, 0.0001], }, ), ] }, 2 + 3 * 4, ), ( preprocessing.StandardScaler() | linear_model.LinearRegression(), { "LinearRegression": { "optimizer": [ (optim.SGD, {"lr": [1, 2]}), (
def build_model_4snarimax(self):
    """Build (or restore from a pickle backup) the SNARIMAX model, then bring
    it up to date with the data seen since the last checkpoint.

    Side effects: sets self.snarimax_model / self.snarimax_metric (and, on
    restore, self.snarimax_para) and rewrites the pickle backup file.
    NOTE(review): pickle.load on self.pck_filename assumes the file is
    trusted — confirm the backup can never come from an external source.
    """
    if os.path.exists(
            self.pck_filename
    ):  # if model backup exists then load it and update model from start1 to start2
        src_bck = pickle.load(open(self.pck_filename, 'rb'))
        model = src_bck.snarimax_model
        metric = src_bck.snarimax_metric
        self.snarimax_para = src_bck.snarimax_para
        self.snarimax_model = model
        self.snarimax_metric = metric
        # Resume from the last date covered by the backup.
        start1 = src_bck.data.index[-1]
        start2 = self.data.index[
            -1]  # self.data.index[-self.data.index[-1].weekday()]
    else:  # if model backup does not exist then rebuild model from the start
        p, d, q, m, sp, sd, sq = self.snarimax_para
        extract_features = compose.TransformerUnion(get_ordinal_date)
        model = (
            extract_features
            | time_series.SNARIMAX(
                p=p,
                d=d,
                q=q,
                m=m,
                sp=sp,
                sd=sd,
                sq=sq,
                regressor=(
                    # preprocessing.Normalizer() |
                    preprocessing.AdaptiveStandardScaler(alpha=0.1)
                    | preprocessing.StandardScaler() |
                    # preprocessing.RobustScaler(with_scaling=True) |
                    linear_model.LinearRegression(
                        intercept_init=0,
                        optimizer=optim.SGD(0.0001),  # important parameter
                        # optimizer=optim.AdaDelta(0.8,0.00001), # important parameter
                        # optimizer=optim.AMSGrad(lr=0.01,beta_1=0.8,beta_2=0.1),
                        intercept_lr=0.001))))
        metric = metrics.Rolling(metrics.MSE(), self.dd_historic)
        # metric = metrics.MSE()
        start1 = self.data.index[0]
        start2 = self.data.index[
            -1]  # self.data.index[-self.data.index[-1].weekday()]
    if start1 < start2:
        # Replay each day between the two checkpoints: forecast first,
        # then learn, so the metric reflects out-of-sample errors.
        for t in pd.date_range(start1, start2, freq='D'):
            x, y = self.snarimax_data.loc[t][['ds', 'temp']].values
            y_pred = model.forecast(horizon=1, xs=[x])
            # print(x,y,y_pred[0],y-y_pred[0])
            model = model.learn_one(x, y)
            metric = metric.update(y, y_pred[0])
        self.snarimax_model = model
        self.snarimax_metric = metric
        # Checkpoint the whole object so the next call can resume.
        with open(self.pck_filename, 'wb') as fh:
            pickle.dump(self, fh)
    # for t in pd.date_range(start1, start2):
    #     x = self.snarimax_data.loc[pd.date_range(t-timedelta(self.dd_historic),t)][['ds']].values
    #     y = self.snarimax_data.loc[pd.date_range(t-timedelta(self.dd_historic),t)][['temp']].values
    #     x = np.hstack(x)
    #     y = np.hstack(y)
    #     y_pred = model.forecast(horizon=self.dd_historic+1, xs=x)
    #     for i in range(0,self.dd_historic):
    #         model = model.learn_one(x[i], y[i])
    #         metric = metric.update(y[i], y_pred[i])
    return
for _, obj in inspect.getmembers(importlib.import_module(submodule), is_estimator): if issubclass(obj, ignored): continue params = obj._unit_test_params() yield obj(**params) @pytest.mark.parametrize( "estimator, check", [ pytest.param(estimator, check, id=f"{estimator}:{check.__name__}") for estimator in list(get_all_estimators()) + [ feature_extraction.TFIDF(), linear_model.LogisticRegression(), preprocessing.StandardScaler() | linear_model.LinearRegression(), preprocessing.StandardScaler() | linear_model.PAClassifier(), (preprocessing.StandardScaler() | multiclass.OneVsRestClassifier( linear_model.LogisticRegression())), (preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(linear_model.PAClassifier())), naive_bayes.GaussianNB(), preprocessing.StandardScaler(), cluster.KMeans(n_clusters=5, seed=42), preprocessing.MinMaxScaler(), preprocessing.MinMaxScaler() + preprocessing.StandardScaler(), feature_extraction.PolynomialExtender(), (feature_extraction.PolynomialExtender() | preprocessing.StandardScaler() | linear_model.LinearRegression()),
def _unit_test_params(cls):
    """Yield the parameters used by the generic unit tests."""
    model = linear_model.LinearRegression()
    yield {"model": model}
def _default_params(cls):
    """Return the default construction parameters used by tests."""
    default_model = linear_model.LinearRegression()
    return {'model': default_model}
n_samples=1440) def __iter__(self): return stream.iter_csv(self.path, target='interval_qps', converters={'interval_qps': int}) def get_ordinal_date(x): return {'ordinal_date': int(x['secs_elapsed'])} model = compose.Pipeline( ('ordinal_date', compose.FuncTransformer(get_ordinal_date)), ('scale', preprocessing.MinMaxScaler()), ('lin_reg', linear_model.LinearRegression())) from river import metrics import matplotlib.pyplot as plt # target_data = "../log_traces/Mixgraph/1000_0.0000073_45000/report.csv" target_data = "../log_traces/StorageMaterial.NVMeSSD/12CPU/64MB/report.csv_1180" import os target_data = os.path.abspath(target_data) def evaluate_model(model): metric = metrics.Rolling(metrics.MAE(), 12) # dates = []
def _unit_test_params(cls):
    """Return the parameters used by the generic unit tests."""
    base_model = linear_model.LinearRegression()
    return {"model": base_model}
import pytest from sklearn.utils import estimator_checks from sklearn import linear_model as sk_linear_model from river import base from river import cluster from river import compat from river import linear_model from river import preprocessing @pytest.mark.parametrize( "estimator", [ pytest.param(estimator, id=str(estimator)) for estimator in [ linear_model.LinearRegression(), linear_model.LogisticRegression(), preprocessing.StandardScaler(), cluster.KMeans(seed=42), ] ], ) @pytest.mark.filterwarnings( "ignore::sklearn.utils.estimator_checks.SkipTestWarning") def test_river_to_sklearn_check_estimator(estimator: base.Estimator): skl_estimator = compat.convert_river_to_sklearn(estimator) estimator_checks.check_estimator(skl_estimator) @pytest.mark.filterwarnings( "ignore::sklearn.utils.estimator_checks.SkipTestWarning")