def box_cox_trans_attribute(self, attribute, lamda):
    """Box-Cox transform one attribute of the train and test data sets.

    The column ``attribute`` of ``self._train_data_set`` and then of
    ``self._test_data_set`` is replaced in place by
    ``boxcox(column, lamda)``.

    Parameters
    ----------
    attribute
        Key of the column to transform in both data sets.
    lamda
        Lambda value handed to ``boxcox``.
    """
    for data_set in (self._train_data_set, self._test_data_set):
        data_set[attribute] = boxcox(data_set[attribute], lamda)
def test_nonfinite():
    """Check the non-finite results of ``boxcox``.

    The checks are plain assertions: the former nose-style ``yield``
    statements turned this function into a generator whose checks current
    pytest never executes.
    """
    # x < 0 => y = nan
    x = np.array([-1, -0.5])
    y = boxcox(x, [0.5, -1.5])
    assert_equal(y, np.array([np.nan, np.nan]))

    # x = 0 and lambda <= 0 => y = -inf
    x = 0
    y = boxcox(x, [-2.5, 0])
    assert_equal(y, np.array([-np.inf, -np.inf]))
def test_boxcox_nonfinite():
    """Check that ``boxcox`` yields nan for x < 0 and -inf for x = 0, lambda <= 0.

    The checks are plain assertions: the former nose-style ``yield``
    statements turned this function into a generator whose checks current
    pytest never executes.
    """
    # x < 0 => y = nan
    x = np.array([-1, -1, -0.5])
    y = boxcox(x, [0.5, 2.0, -1.5])
    assert_equal(y, np.array([np.nan, np.nan, np.nan]))

    # x = 0 and lambda <= 0 => y = -inf
    x = 0
    y = boxcox(x, [-2.5, 0])
    assert_equal(y, np.array([-np.inf, -np.inf]))
def box_cox_trans_attribute(self, attribute, lamda):
    """Box-Cox transform one attribute of the train (and test) data set.

    The column ``attribute`` of ``self._train_data_set`` is always replaced
    by ``boxcox(column, lamda)``. The same is done for
    ``self._test_data_set`` unless that data set is empty, in which case a
    notice is printed instead.

    Parameters
    ----------
    attribute
        Key of the column to transform.
    lamda
        Lambda value handed to ``boxcox``.
    """
    train_column = self._train_data_set[attribute]
    self._train_data_set[attribute] = boxcox(train_column, lamda)

    if self._test_data_set.empty:
        print("no test data set")
        return

    test_column = self._test_data_set[attribute]
    self._test_data_set[attribute] = boxcox(test_column, lamda)
def predict_price(location, area, bedrooms, bathrooms):
    """Return a sentence with the predicted monthly rent.

    Builds one feature row the length of ``X.columns``: positions 0-2 hold
    ``boxcox(value, 0)`` of ``area``, ``bedrooms`` and ``bathrooms``, and the
    position of the column equal to ``location`` is set to 1. The row is
    passed to the module-level ``model`` and the prediction is mapped back
    with ``inv_boxcox(..., 0)`` and rounded.

    Parameters
    ----------
    location
        Value compared against ``X.columns``; when no column matches, no
        location indicator is set.
    area, bedrooms, bathrooms
        Numeric features; each is transformed with lambda 0.

    Returns
    -------
    str
        ``"The rental predicted price for this house is <n> euros per month."``
    """
    # np.where yields an empty index array when nothing matches; indexing it
    # unconditionally raised IndexError and made the guard below unreachable.
    matches = np.where(X.columns == location)[0]
    loc_index = matches[0] if len(matches) else -1

    x = np.zeros(len(X.columns))
    x[0] = boxcox(area, 0)
    x[1] = boxcox(bedrooms, 0)
    x[2] = boxcox(bathrooms, 0)
    if loc_index >= 0:
        x[loc_index] = 1

    predicted = round(inv_boxcox(model.predict([x])[0], 0))
    return "The rental predicted price for this house is " + " ".join(
        (str(predicted), "euros per month."))
def test_basic():
    """Check ``boxcox`` against closed-form values for a few lambdas.

    The checks are plain assertions: the former nose-style ``yield``
    statements turned this function into a generator whose checks current
    pytest never executes.
    """
    x = np.array([1, 2, 3])

    # lambda = 0 => y = log(x)
    y = boxcox(x, 0)
    assert_almost_equal(y, np.log(x))

    # lambda = 1 => y = x - 1
    y = boxcox(x, 1)
    assert_almost_equal(y, x - 1)

    # lambda = 2 => y = 0.5*(x**2 - 1)
    y = boxcox(x, 2)
    assert_almost_equal(y, 0.5*(x**2 - 1))

    # x = 0 and lambda > 0 => y = -1 / lambda
    lam = np.array([0.5, 1, 2])
    y = boxcox(0, lam)
    assert_almost_equal(y, -1.0 / lam)

    # x < 0 => y = nan for every lambda. The previous expectation of finite
    # values for negative x contradicts the documented behaviour of
    # scipy.special.boxcox; only the broadcast shape (2, 2) is kept.
    x = np.array([-1.0, -0.5])
    y = boxcox(x, np.array([[1], [2]]))
    assert_almost_equal(y, np.full((2, 2), np.nan))
def generate(x, y, filename):
    """Generate fixture data and write it to a JSON file.

    # Arguments

    * `x`: first argument handed to `boxcox`
    * `y`: second argument handed to `boxcox`
    * `filename::str`: name of the output file, joined onto `DIR`

    # Examples

    ```python
    python> x = np.linspace(-10.0, 10.0, 2001)
    python> y = np.linspace(-5.0, 5.0, 2001)
    python> generate(x, y, './data.json')
    ```
    """
    expected = boxcox(x, y)

    # Key order is part of the written file, so keep x, y, expected.
    fixture = {
        'x': x.tolist(),
        'y': y.tolist(),
        'expected': expected.tolist(),
    }

    with open(path.join(DIR, filename), 'w') as out:
        json.dump(fixture, out)
def fit(self, input_data):
    """Fit an ARIMA model on the Box-Cox transformed series.

    :param input_data: data with features, target and ids to process
    :return: the result of ``ARIMA(...).fit()``, also stored in ``self.arima``
    """
    series = np.array(input_data.features)

    # Save actual time series length
    self.actual_ts_len = len(series)
    self.sts = series

    # Box-Cox is applied to positive values only, so a series whose minimum
    # is not positive is shifted first; the shift is kept in self.scope.
    lowest = np.min(series)
    if not lowest > 0:
        self.scope = abs(lowest) + 1
        series = series + self.scope

    # stats.boxcox is used only to estimate lambda here.
    _, self.lambda_value = stats.boxcox(series)
    transformed_ts = boxcox(series, self.lambda_value)

    # Set parameters
    order = tuple(int(self.params.get(name)) for name in ('p', 'd', 'q'))
    self.arima = ARIMA(transformed_ts, order=order).fit()
    return self.arima
def transform(self, X):
    """Box-Cox transform the fitted columns of a copy of ``X``.

    Every column in ``self.transform_cols`` is replaced by
    ``boxcox(column, self.lmbda)`` in a copy of ``X``. The skewness and
    kurtosis of the transformed columns are left-merged onto
    ``self.stat_df`` by index.

    Parameters
    ----------
    X
        Data frame holding at least the columns in ``self.transform_cols``.

    Returns
    -------
    The transformed copy of ``X``.

    Raises
    ------
    NotFittedError
        If ``self.transform_cols`` is None.
    """
    if self.transform_cols is None:
        raise NotFittedError(
            f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
        )

    transformed = X.copy()
    for col in self.transform_cols:
        transformed[col] = boxcox(transformed[col], self.lmbda)

    # Transformed skewness & kurtosis
    selected = transformed[self.transform_cols]
    skewness = selected.skew().to_frame(name='Skewness (Box Cox)')
    kurtosis = selected.kurt().to_frame(name='Kurtosis (Box Cox)')

    on_index = dict(left_index=True, right_index=True, how='left')
    new_stats = skewness.merge(kurtosis, **on_index)
    self.stat_df = self.stat_df.merge(new_stats, **on_index)
    return transformed
def get_estimated_rent(area, sq_mt, bedrooms, bathrooms):
    """Return the rounded rent predicted by the module-level model.

    Builds one feature row the length of ``__data_columns``: positions 0-2
    hold ``boxcox(value, 0)`` of ``sq_mt``, ``bedrooms`` and ``bathrooms``,
    and the position of ``area.lower()`` in ``__data_columns`` is set to 1.
    The model output is mapped back with ``inv_boxcox(..., 0)``.

    Parameters
    ----------
    area
        Area name, looked up lower-cased; an unknown or non-string area
        leaves every area indicator at 0.
    sq_mt, bedrooms, bathrooms
        Numeric features; each is transformed with lambda 0.

    Returns
    -------
    The rounded prediction.
    """
    try:
        # From a list, we can get the index by simply using .index()
        loc_index = __data_columns.index(area.lower())
    except (ValueError, AttributeError):
        # ValueError: area is not among the columns. AttributeError: the
        # lookup could not be attempted (e.g. area has no .lower()). Anything
        # else is unexpected and is no longer swallowed by a bare except.
        loc_index = -1

    x = np.zeros(len(__data_columns))
    x[0] = boxcox(sq_mt, 0)
    x[1] = boxcox(bedrooms, 0)
    x[2] = boxcox(bathrooms, 0)
    if loc_index >= 0:
        x[loc_index] = 1

    # this is how we call our model. x is the input in the form of a 2D array
    return round(inv_boxcox(__model.predict([x])[0], 0))
def test_boxcox_basic():
    """Check ``boxcox`` against closed-form values for a few lambdas.

    The checks are plain assertions: the former nose-style ``yield``
    statements turned this function into a generator whose checks current
    pytest never executes.
    """
    x = np.array([0.5, 1, 2, 4])

    # lambda = 0 => y = log(x)
    y = boxcox(x, 0)
    assert_almost_equal(y, np.log(x))

    # lambda = 1 => y = x - 1
    y = boxcox(x, 1)
    assert_almost_equal(y, x - 1)

    # lambda = 2 => y = 0.5*(x**2 - 1)
    y = boxcox(x, 2)
    assert_almost_equal(y, 0.5*(x**2 - 1))

    # x = 0 and lambda > 0 => y = -1 / lambda
    lam = np.array([0.5, 1, 2])
    y = boxcox(0, lam)
    assert_almost_equal(y, -1.0 / lam)
def target_transform(self, Y):
    """Return the (optionally) Box-Cox transformed target.

    When ``self.TRANSFORM`` is truthy the result is
    ``boxcox(Y, self.LAMBDA)``; otherwise ``Y`` itself is returned.
    """
    if not self.TRANSFORM:
        # no transform
        return Y

    # self.LAMBDA is an empirical parameter
    return boxcox(Y, self.LAMBDA)
def test_inv_boxcox():
    """Round-trip check: each inverse transform undoes its forward transform."""
    pairs = ((boxcox, inv_boxcox), (boxcox1p, inv_boxcox1p))
    for forward, inverse in pairs:
        x = np.array([0., 1., 2.])
        lam = np.array([0., 1., 2.])
        recovered = inverse(forward(x, lam), lam)
        assert_almost_equal(x, recovered)
def _transform(self, Z, X=None):
    """Box-Cox transform a series with ``self.lambda_``.

    Parameters
    ----------
    Z : pd.Series
        Series to transform; validated with
        ``check_series(..., enforce_univariate=True)``.
    X : pd.DataFrame, optional (default=None)
        Exogenous data; not used by this method.

    Returns
    -------
    Zt : pd.Series
        Transformed values carrying the index of the validated series.
    """
    series = check_series(Z, enforce_univariate=True)
    values = boxcox(series.to_numpy(), self.lambda_)
    return pd.Series(values, index=series.index)
def _transform(self, X, y=None):
    """Box-Cox transform ``X`` element-wise and keep its shape.

    Private ``_transform`` holding the core logic, called from ``transform``.

    Parameters
    ----------
    X : 2D np.ndarray (n x 1)
        Data to be transformed.
    y : ignored argument for interface compatibility

    Returns
    -------
    Xt : 2D np.ndarray
        ``boxcox`` of the flattened data with ``self.lambda_``, reshaped to
        the shape of ``X``.
    """
    flat = boxcox(X.flatten(), self.lambda_)
    return flat.reshape(X.shape)
def transform(self, x):
    """
    Apply the per-column Box-Cox transform to ``x``.

    Each column of the checked input is shifted by ``self._shift[i]`` and
    transformed with ``self._lmd[i]``. Note that ``self._shift`` is updated
    in place while transforming.

    Parameters
    ----------
    x
        Input data; it is first passed through ``self._check_type``.

    Returns
    -------
    DataFrame
        Box-Cox transformed data.
        NOTE(review): the value returned is whatever ``np.concatenate``
        followed by ``ravel``/``reshape`` yields -- confirm that
        "DataFrame" is still accurate.
    """
    x = self._check_type(x)
    xs = []
    # Columns are visited by iterating over the transpose.
    for i, col in enumerate(x.T):
        if np.all(col > 0):
            # An all-positive column needs no shift.
            self._shift[i] = 0.
        else:
            # Subtract the smallest non-NaN value from the stored shift.
            self._shift[i] -= col[~np.isnan(col)].min()

        _lmd = self._lmd[i]
        _shift = self._shift[i]
        # NOTE(review): `Switch` is defined outside this block; presumably
        # case(v) matches when _lmd equals v and case() is the default --
        # confirm, in particular how the np.nan case is matched.
        for case in Switch(_lmd):
            if case(np.inf):
                # Column is passed through unchanged.
                x = col
                break
            if case(np.nan):
                # Column is replaced by NaNs of the same shape.
                x = np.full(col.shape, np.nan)
                break
            if case():
                x = boxcox(col + _shift, _lmd)
        # `x` now holds the transformed column (the parameter name is reused).
        xs.append(x.reshape(-1, 1))
    xs = np.concatenate(xs, axis=1)
    # self._shape decides whether a flat or a 2D result is returned.
    if len(self._shape) == 1:
        return xs.ravel()
    return xs.reshape(-1, self._shape[1])
def apply(self, ds):
    """Box-Cox transform the configured variables of ``ds``.

    For every ``(name, lmbda, boundary_location)`` triple taken from
    ``self.var_names``, ``self.lmbdas`` and ``self.boundary_locations`` the
    variable is negated when its boundary is ``'right'``, shifted per
    feature, transformed with ``boxcox`` and written back into the dataset.
    The shift used for each variable is appended to
    ``self.shifting_factors``.

    Parameters
    ----------
    ds
        Dataset supporting ``astype``, ``assign`` and item access by
        variable name (presumably an xarray Dataset -- TODO confirm).

    Returns
    -------
    The dataset with the transformed variables assigned.

    Raises
    ------
    AssertionError
        If ``self.shifting_factors`` is already non-empty.
    """
    assert not self.shifting_factors, 'This function cannot be called twice.'
    ds = ds.astype('float64')
    for name, lmbda, boundary_location in \
            zip(self.var_names, self.lmbdas, self.boundary_locations):
        if boundary_location == 'right':
            # Negate the variable when its boundary is on the right.
            ds = ds.assign({name: -ds[name]})
        # The first dimension of the variable is treated as the sample axis.
        sample_dim = ds[name].dims[0]
        stacked, stack_info = util.to_stacked_array(ds[[name]])
        mins = stacked.min(sample_dim)  # feature
        shifting_factor_per_feature = abs(
            mins) + NUMERICAL_OFFSET  # feature
        # No shift where the minimum is already at least NUMERICAL_OFFSET.
        shifting_factor_per_feature.load()[mins >= NUMERICAL_OFFSET] = 0.
        self.shifting_factors.append(shifting_factor_per_feature)
        transformed = boxcox(stacked + shifting_factor_per_feature, lmbda)
        unstacked = util.to_unstacked_dataset(transformed.values, stack_info)
        ds = ds.assign({name: unstacked[name]})
    return ds
def _boxcox(x, lmbda=None, bounds=None, alpha=None):
    r"""Return a dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Must be positive 1-dimensional.  Must not be constant.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.

        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    bounds : optional
        Forwarded unchanged to `_boxcox_normmax` when `lmbda` is None;
        not used otherwise.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.  An empty input is returned as is.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    Raises
    ------
    ValueError
        If `x` is not 1-dimensional, is constant, or contains a value that
        is not strictly positive.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant
    to `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the
    interval where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the
    chi-squared function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).

    """
    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Data must be 1-dimensional.")

    if x.size == 0:
        return x

    if np.all(x == x[0]):
        raise ValueError("Data must not be constant.")

    # np.any reduces the boolean array in compiled code; the builtin any()
    # walked it element by element in Python.
    if np.any(x <= 0):
        raise ValueError("Data must be positive.")

    if lmbda is not None:  # single transformation
        return special.boxcox(x, lmbda)

    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = _boxcox_normmax(x, bounds=bounds, method="mle")
    y = _boxcox(x, lmax)

    if alpha is None:
        return y, lmax
    else:
        # Find confidence interval
        interval = _boxcox_conf_interval(x, lmax, alpha)
        return y, lmax, interval
def getPredict(region, sproduct, scale=1.96):
    """Fit XGBoost on the Box-Cox transformed price series and plot the fit.

    Prices for ``region``/``sproduct`` are read from ``price.tab``,
    transformed with the module-level ``lmbda`` and split by ``prepareData``.
    The number of boosting rounds is picked by 10-fold cross-validation on
    rmse. Train and test predictions are mapped back with ``inv_boxcox`` and
    plotted next to the true values, with the rounded MAPE in each title.

    Parameters
    ----------
    region
        Value matched against the ``region`` column.
    sproduct
        Value matched against the ``products`` column; also shown in the
        plot titles.
    scale
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    None
    """
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******',
                                 db='price',
                                 charset='utf8mb4',
                                 cursorclass=DictCursor)
    # try/finally so the connection is closed even when reading, training or
    # plotting raises.
    try:
        print('region {} product {}'.format(region, sproduct))
        # Values are bound by the driver instead of being formatted into the
        # SQL text, so quotes inside a region/product name cannot break (or
        # alter) the statement.
        df = pd.read_sql(
            "SELECT ymd, price FROM price.tab "
            "WHERE region = %s and products=%s",
            con=connection,
            params=(region, sproduct))
        #df.set_index('ymd')
        df.price = boxcox(df.price, lmbda)
        X_train, X_test, y_train, y_test = prepareData(
            df,
            test_size=12,
            lag_start=12,
            lag_end=24,
        )
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        # set the parameters
        params = {'objective': 'reg:squarederror', 'booster': 'gbtree'}
        trees = 1000

        # run cross-validation with the rmse metric
        cv = xgb.cv(params,
                    dtrain,
                    metrics='rmse',
                    verbose_eval=False,
                    nfold=10,
                    show_stdv=False,
                    num_boost_round=trees,
                    seed=0)

        # train xgboost with the optimal number of trees found by
        # cross-validation. Row i of the cv frame is the score after i + 1
        # rounds, so the 0-based argmin needs + 1; without it the model was
        # one round short and got zero rounds when the first was best.
        best_rounds = int(cv['test-rmse-mean'].values.argmin()) + 1
        bst = xgb.train(params, dtrain, num_boost_round=best_rounds)

        # validation curves can be plotted
        # cv.plot(y=['test-mae-mean', 'train-mae-mean'])

        # remember the cross-validation error
        # deviation = cv.loc[cv['test-rmse-mean'].argmin()]["test-rmse-mean"]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))

        # see how the model behaved on the training part of the series
        prediction_train = inv_boxcox(bst.predict(dtrain), lmbda)
        y_train = inv_boxcox(y_train, lmbda)
        ax1.plot(y_train, label="y_train")
        ax1.plot(prediction_train, label="prediction")
        ax1.axis('tight')
        ax1.grid(True)
        ax1.legend()
        ax1.set_title("{} \n MAPE {}".format(
            sproduct,
            round(mean_absolute_percentage_error(y_train, prediction_train))))

        # and on the test part
        prediction_test = inv_boxcox(bst.predict(dtest), lmbda)
        y_test = inv_boxcox(y_test, lmbda)
        ax2.plot(list(y_test), label="y_test")
        ax2.plot(prediction_test, label="prediction")
        ax2.axis('tight')
        ax2.grid(True)
        ax2.legend()
        ax2.set_title("{} \n MAPE {}".format(
            sproduct,
            round(mean_absolute_percentage_error(y_test, prediction_test))))

        plt.show()
    finally:
        connection.close()
d = namespace.d q = namespace.q sp = namespace.sp sd = namespace.sd sq = namespace.sq ss = namespace.ss datein = namespace.datein dateout = namespace.dateout timeforcast = namespace.timeforcast df = pd.read_sql( 'SELECT ymd, price FROM price.tab_price WHERE region = "{}" and product="{}" and ymd > "{}" and ymd < "{}" ' 'ORDER BY ymd '.format(region, product, datein, dateout), con=connection) df['price'] = boxcox(df['price'], lmbda) df['ymd'] = pd.to_datetime(df['ymd']) df = df.set_index('ymd') mod = sm.tsa.statespace.SARIMAX(df['price'], order=(p, d, q), seasonal_order=(sp, sd, sq, ss)) res = mod.fit(disp=False) param = {'p':p, 'd':d, 'q':q, 'sp':sp, 'sd':sd, 'sq':sq, 'ss':ss} datend = df['price'].index[-1] + (relativedelta(months=+(nforecast))) if timeforcast == "m" else (dateout + relativedelta(weeks=+(nforecast))) predict = res.get_prediction(start = df['price'].index[0], end= datend) p_main = inv_boxcox(predict.predicted_mean, lmbda)
def box_cox_target(self, lamda):
    """Replace ``self._y_train`` by its Box-Cox transform with ``lamda``."""
    transformed_target = boxcox(self._y_train, lamda)
    self._y_train = transformed_target
def test_boxcox_underflow():
    """Check that a tiny lambda gives the log(x) value for x just above 1."""
    near_one = 1 + 1e-15
    tiny_lmbda = 1e-306
    assert_allclose(boxcox(near_one, tiny_lmbda), np.log(near_one),
                    rtol=1e-14)
def boxcox(x, lmbda=None, bounds=None, alpha=None):
    r"""
    Return a dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Must be positive 1-dimensional.  Must not be constant.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.

        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    bounds : optional
        Forwarded unchanged to `boxcox_normmax` when `lmbda` is None;
        not used otherwise.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.  An empty input is returned as is.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    Raises
    ------
    ValueError
        If `x` is not 1-dimensional, is constant, or contains a value that
        is not strictly positive.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant
    to `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the
    interval where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the
    chi-squared function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make
    a probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `boxcox` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, _ = stats.boxcox(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Box-Cox transformation')

    >>> plt.show()

    """
    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Data must be 1-dimensional.")

    if x.size == 0:
        return x

    if np.all(x == x[0]):
        raise ValueError("Data must not be constant.")

    # np.any reduces the boolean array in compiled code; the builtin any()
    # walked it element by element in Python.
    if np.any(x <= 0):
        raise ValueError("Data must be positive.")

    if lmbda is not None:  # single transformation
        return special.boxcox(x, lmbda)

    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = boxcox_normmax(x, bounds=bounds, method='mle')
    y = boxcox(x, lmax)

    if alpha is None:
        return y, lmax
    else:
        # Find confidence interval
        interval = _boxcox_conf_interval(x, lmax, alpha)
        return y, lmax, interval
def getPredict(namespace):
    """Grid-search SARIMAX orders and return the best by AIC, BIC and HQIC.

    Prices for the requested region/product/date window are read through the
    module-level ``connection``, Box-Cox transformed with ``namespace.lmbda``
    and fitted with every combination of
    ``range(p) x range(d) x range(q) x range(sp) x range(sd) x range(sq) x
    range(ss)``. The seasonal period of each candidate is
    ``int(param[6] * season)``. Combinations whose model cannot be built or
    fitted are skipped.

    Parameters
    ----------
    namespace
        Object providing ``region``, ``product``, ``lmbda``, ``season``,
        ``p``, ``d``, ``q``, ``sp``, ``sd``, ``sq``, ``ss``, ``datein`` and
        ``dateout``.

    Returns
    -------
    list
        ``[best_param_aic, best_param_bic, best_param_hqic]``; an entry is
        None when no candidate could be fitted (previously this case raised
        UnboundLocalError).
    """
    region = namespace.region
    sproduct = namespace.product
    lmbda = namespace.lmbda
    season = namespace.season
    p = namespace.p
    d = namespace.d
    q = namespace.q
    sp = namespace.sp
    sd = namespace.sd
    sq = namespace.sq
    ss = namespace.ss
    datein = namespace.datein
    dateout = namespace.dateout

    # NOTE(review): the statement is assembled with str.format. If these
    # values can come from outside, bind them through the `params` argument
    # of read_sql using the placeholder style of the driver behind
    # `connection` (not visible from this function).
    df = pd.read_sql(
        'SELECT ymd, price FROM price.tab_price WHERE region = "{}" and product="{}" and ymd > "{}" and ymd < "{}"'
        .format(region, sproduct, datein, dateout),
        con=connection)
    dta = df.price.values
    train = boxcox(dta, lmbda)

    n_p = range(0, p)
    n_d = range(0, d)
    n_q = range(0, q)
    n_sp = range(0, sp)
    n_sd = range(0, sd)
    n_sq = range(0, sq)
    n_ss = range(0, ss)
    print(n_p)

    best_aic = best_bic = best_hqic = float("inf")
    best_param_aic = best_param_bic = best_param_hqic = None

    for param in product(n_p, n_d, n_q, n_sp, n_sd, n_sq, n_ss):
        try:
            model = sm.tsa.statespace.SARIMAX(
                train,
                order=(param[0], param[1], param[2]),
                seasonal_order=(param[3], param[4], param[5],
                                int(param[6] * season)),
                enforce_invertibility=False)
            res = model.fit(disp=-1)
        except Exception:
            # Wrong parameters for this combination: skip it. Exception
            # (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            continue

        aic = res.aic
        if aic < best_aic:
            best_aic = aic
            best_param_aic = param

        bic = res.bic
        if bic < best_bic:
            best_bic = bic
            best_param_bic = param

        hqic = res.hqic
        if hqic < best_hqic:
            best_hqic = hqic
            best_param_hqic = param

    return [best_param_aic, best_param_bic, best_param_hqic]
products = [ 'Молоко сырое крупного рогатого скота', 'Пшеница мягкая 3 класса', 'Пшеница мягкая 5 класса', 'Ячмень', 'Гречиха', 'Семена подсолнечника', 'Свекла столовая', 'Птица сельскохозяйственная живая', 'Олени северные', 'Картофель' ] for sproduct in products: df = pd.read_sql( 'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}"' .format(region, sproduct), con=connection) dta = df.price.values[start:] #dta = dta.reindex() xt = boxcox(dta, lmbda) train = xt[:len(xt) - nforecast] df = pd.read_sql( 'SELECT * FROM price.model WHERE region = "{}" and product="{}"'. format(region, sproduct), con=connection) # Graph fig, ax = plt.subplots(figsize=(12, 10)) ax.xaxis.grid() ax.yaxis.grid() ax.plot(inv_boxcox(xt, lmbda), 'k.') for param in df.iterrows(): mod = sm.tsa.statespace.SARIMAX(
else: return (np.exp(np.log(ld * y + 1) / ld)) y = train.血糖.values print(y) # We use the numpy function log1p which applies log(1+x) to all elements of the column # train[target_item] = np.add(10**15 * train[target_item], 0) # train[target_item] = np.log1p(train[target_item]) # train[target_item] = np.sin(0.177*train[target_item]-0.08) # sin不能一一对应? # train[target_item] = np.arctan(train[target_item] - 3.17) # train[target_item] = np.log1p(train[target_item]) # train[target_item] = np.arctan(0.6 * train[target_item] - 1.6) # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # train[target_item] = np.log1p(train[target_item]**-2) # train[target_item] = np.arctan(0.4 * train[target_item] - .8) train["血糖"] = boxcox(train["血糖"], 0.15) """ for i in range(len(train[target_item])): pass """ ''' i = 0 for item in test_item: fig, ax = plt.subplots() ax.scatter(x = train[item], y = train[target_item]) plt.ylabel(target_item, fontsize=13) plt.xlabel(item + f' {i}', fontsize=13) # plt.show() name = item if item[0] == '*': name = name[1:]
def getTrainData(region, product, datein, dateout, lag, lagVal, lagS, AvPr,
                 AvPrVal, test_size):
    """Build the training feature frame and the fitted trend model.

    Reads prices for ``region``/``product`` between ``datein`` and
    ``dateout`` through the module-level ``connection``, drops the last
    ``test_size`` rows, joins the USD rows of ``price.valuta`` on the date,
    Box-Cox transforms the price with the module-level ``lmbda``, removes a
    fitted degree-2 polynomial trend and derives lag, monthly and rolling
    features.

    Parameters
    ----------
    region, product, datein, dateout
        Values formatted into the price query.
    lag, lagS
        Iterables of shifts applied to the detrended price (zeros in
        ``lagS`` are skipped).
    lagVal
        Iterable of shifts applied to ``ValueVal``.
    AvPr
        Rolling window used on the monthly detrended price.
    AvPrVal
        Rolling window used on the monthly ``ValueVal``; 0 disables it.
    test_size
        Number of trailing rows excluded from training.

    Returns
    -------
    tuple
        ``(data, trend)``; ``(df_train, 0)`` when the price query returns no
        rows.
    """
    # NOTE(review): the query text is assembled with str.format -- confirm
    # the arguments never come from untrusted input.
    df_train = pd.read_sql(
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}" and ymd > "{}" and ymd < "{}"'
        .format(region, product, datein, dateout),
        con=connection)
    if len(df_train) == 0:
        return (df_train, 0)
    # Keep everything except the last `test_size` rows.
    test_index = int(len(df_train) - test_size)
    df_train = df_train[:test_index]
    df_valute = pd.read_sql(
        'SELECT ValueVal, dateCalendar FROM price.valuta WHERE CharCode = "{}" '
        .format("USD"),
        con=connection)
    data = pd.merge(df_train, df_valute, left_on='ymd', right_on='dateCalendar')
    data["price_boxcox"] = boxcox(data["price"], lmbda)
    # Degree-2 polynomial regression of the transformed price on the date
    # ordinal.
    trend = Pipeline([('poly', PolynomialFeatures(degree=2)),
                      ('linear', LinearRegression(fit_intercept=False))])
    x = data['ymd'].map(datetime.datetime.toordinal)
    trend.fit(x.values.reshape(-1, 1), data.price_boxcox.values.reshape(-1, 1))
    data["Trend"] = trend.predict(data['ymd'].map(
        datetime.datetime.toordinal).values.reshape(-1, 1))
    data["PriceWithOutTrend"] = data["price_boxcox"] - data["Trend"]
    data.loc[
        data['price'] == 0,
        "PriceWithOutTrend"] = 0.01  # if the price is zero, put the mean (NOTE(review): the value written is the constant 0.01)
    for i in lag:
        data["PriceWithOutTrend{}".format(i)] = data.PriceWithOutTrend.shift(i)
    for i in lagS:
        if i != 0:
            data["PriceWithOutTrend{}".format(
                i)] = data.PriceWithOutTrend.shift(i)
    for i in lagVal:
        data["lagValute_{}".format(i)] = data.ValueVal.shift(i)
    # means, maximum, minimum over a quarter, half a year, a year
    data.ymd = pd.to_datetime(data["ymd"])
    data["month"] = data.ymd.dt.month
    # Per-calendar-month statistics of the detrended price, broadcast back
    # onto every row.
    meanPrice = data.groupby('month')['PriceWithOutTrend'].aggregate('mean')
    maxPrice = data.groupby('month')['PriceWithOutTrend'].aggregate('max')
    minPrice = data.groupby('month')['PriceWithOutTrend'].aggregate('min')
    data.loc[:, 'meanPrice'] = [meanPrice[month] for month in data['month']]
    data.loc[:, 'maxPrice'] = [maxPrice[month] for month in data['month']]
    data.loc[:, 'minPrice'] = [minPrice[month] for month in data['month']]
    # Monthly resample, shifted by one period so the rolling statistics only
    # use earlier months.
    df = data.set_index('ymd').resample('MS', label='right').first()
    df1 = df['PriceWithOutTrend'].shift().rolling(
        min_periods=1, window=AvPr).agg(['mean', 'median']).reset_index()
    data = pd.merge(data, df1, on=['ymd'], how='left')
    if AvPrVal != 0:
        df2 = df['ValueVal'].shift().rolling(
            min_periods=1, window=AvPrVal).agg(['mean',
                                                'median']).reset_index()
        data = pd.merge(data, df2, on=['ymd'], how='left')
    # Drop the helper columns that are not model features.
    data.drop(["price"], axis=1, inplace=True)
    data.drop(["price_boxcox"], axis=1, inplace=True)
    data.drop(["ymd"], axis=1, inplace=True)
    data.drop(["month"], axis=1, inplace=True)
    data.drop(["dateCalendar"], axis=1, inplace=True)
    #data.drop(["Trend"], axis=1, inplace=True)
    data = data.dropna()
    data = data.reset_index(drop=True)
    return data, trend
def transform(self, Z, X=None):
    """Box-Cox transform a series with the fitted ``self.lambda_``.

    ``self.check_is_fitted()`` is called first. ``Z`` is validated with
    ``check_series(..., enforce_univariate=True)``; ``X`` is not used.

    Returns
    -------
    pd.Series
        Transformed values carrying the index of the validated series.
    """
    self.check_is_fitted()
    series = check_series(Z, enforce_univariate=True)
    values = boxcox(series.to_numpy(), self.lambda_)
    return pd.Series(values, index=series.index)
def fit_transform(self, y):
    """Return the Box-Cox transform of ``y`` with the fixed lambda 0.5."""
    fixed_lambda = 0.5
    transformed = boxcox(y, fixed_lambda)
    return transformed
def getTestData(datal,
                train=False,
                lag=0,
                lagVal=0,
                lagS=0,
                AvPr=0,
                AvPrVal=0,
                winWeather=0,
                AvMonth=0,
                tr=1,
                twinter=100,
                tsummer=100,
                tsping=100,
                tautomn=100,
                rwinter=100,
                rsummer=100,
                rsping=100,
                rautomn=100,
                dateout=None,
                nforecast=1):
    """Build the feature frame used for training or forecasting.

    Works on a copy of ``datal``: zero prices are replaced by interpolated
    values, the price is Box-Cox transformed with the module-level ``lmbda``,
    a polynomial trend is removed and optional lag, monthly, rolling and
    weather features are added.

    Parameters
    ----------
    datal
        Frame with at least ``ymd``, ``price``, ``ValueVal`` and
        ``dateCalendar`` columns (all are read or dropped below).
    train
        When true, the module-level ``trend`` is re-fitted on this data;
        otherwise the existing global ``trend`` is used.
    lag, lagVal
        Number of shifted copies of the detrended price / ``ValueVal``.
    lagS
        Single extra shift of the detrended price; 0 disables it.
    AvPr, AvPrVal
        Rolling windows on the monthly detrended price / ``ValueVal``;
        0 disables each.
    winWeather
        Non-zero enables the weather features.
    AvMonth
        1 enables the per-month mean/max/min features.
    tr
        Degree of the trend polynomial.
    twinter, tsummer, tsping, tautomn, rwinter, rsummer, rsping, rautomn
        Forwarded to ``newT`` when synthesising future ``T``/``R`` rows.
    dateout
        Upper bound of the weather query and start of the synthesised
        period; required when ``winWeather`` is non-zero.
    nforecast
        Number of months for which weather rows are synthesised.

    Returns
    -------
    The feature frame with the helper columns removed and the index reset.
    """
    global trend
    data = datal.copy()
    data["month"] = data.ymd.dt.month
    # NOTE(review): `Series.dt.weekofyear` is no longer available in recent
    # pandas releases -- confirm the pinned pandas version.
    data["week"] = data.ymd.dt.weekofyear
    #data["year"] = data.ymd.dt.year
    #data['yearofchange'] = (data["ymd"] > datetime.datetime(2015,1,1))
    # Zero prices are treated as missing and filled by interpolation.
    data = data.replace({"price": {0: np.nan}})
    # NOTE(review): in-place interpolate on a column selection and
    # fillna(method=...) are deprecated in recent pandas -- confirm.
    data["price"].interpolate(inplace=True)
    data = data.fillna(method='bfill')
    data["price_boxcox"] = boxcox(data["price"], lmbda)
    if train:
        # Fit the polynomial trend on the date ordinal and publish it through
        # the global `trend`.
        x = data['ymd'].map(datetime.datetime.toordinal)
        trend = np.poly1d(np.polyfit(x.values, data.price_boxcox.values, tr))
    data["Trend"] = trend(data['ymd'].map(datetime.datetime.toordinal).values)
    data["PriceWithOutTrend"] = data["price_boxcox"] - data["Trend"]
    #data["diff"] = data["PriceWithOutTrend"] - data["PriceWithOutTrend"].shift(1)
    #data["diff2"] = data["PriceWithOutTrend"] - data["PriceWithOutTrend"].shift(2)
    #data.loc[0, "diff"] = 0
    #data.loc[[0,1], "diff2"] = 0
    #data.loc[data['price'] == 0, "PriceWithOutTrend"]= 0.1 # if the price is zero, put the mean
    for i in range(1, lag + 1):
        data["PriceWithOutTrend{}".format(i)] = data.PriceWithOutTrend.shift(i)
        #data["PriceWithOutTrend{}".format(i)].fillna(0, inplace=True)
    for i in [lagS]:
        if i != 0:
            data["PriceWithOutTrendS{}".format(
                i)] = data.PriceWithOutTrend.shift(i)
            #data["PriceWithOutTrendS{}".format(i)].fillna(0, inplace=True)
    for i in range(1, lagVal + 1):
        data["lagValute{}".format(i)] = data.ValueVal.shift(i)
        #data["lagValute{}".format(i)].fillna(0, inplace=True)
    # means, maximum, minimum over a quarter, half a year, a year
    if AvMonth == 1:
        # The last row is excluded from the per-month statistics.
        meanPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
            'mean')
        maxPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
            'max')
        minPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
            'min')
        data.loc[:, 'meanPrice'] = [meanPrice[month] for month in data['month']]
        data.loc[:, 'maxPrice'] = [maxPrice[month] for month in data['month']]
        data.loc[:, 'minPrice'] = [minPrice[month] for month in data['month']]
    if AvPr != 0:
        # Rolling statistics of the monthly detrended price, shifted by one
        # period, joined back on the date.
        df = data.set_index('ymd').resample('MS', label='right').first()
        df1 = df['PriceWithOutTrend'].shift().rolling(
            min_periods=1, window=AvPr).agg(['mean', 'max',
                                             'min']).reset_index()
        df1 = df1.add_suffix('_AvPr')
        data = pd.merge(data,
                        df1,
                        left_on=['ymd'],
                        right_on=['ymd_AvPr'],
                        how='left')
        #data["mean_AvPr"].fillna(0, inplace=True)
        data.drop(["ymd_AvPr"], axis=1, inplace=True)
        for i in range(1, lag + 1):
            data["mean_AvPr{}".format(i)] = data.mean_AvPr.shift(i)
            data["mean_AvPr{}".format(i)].fillna(0, inplace=True)
            data["max_AvPr{}".format(i)] = data.max_AvPr.shift(i)
            data["max_AvPr{}".format(i)].fillna(0, inplace=True)
            data["min_AvPr{}".format(i)] = data.min_AvPr.shift(i)
            data["min_AvPr{}".format(i)].fillna(0, inplace=True)
    if AvPrVal != 0:
        # Same rolling statistics for ValueVal.
        df = data.set_index('ymd').resample('MS', label='right').first()
        df2 = df['ValueVal'].shift().rolling(
            min_periods=1, window=AvPrVal).agg(['mean', 'max',
                                                'min']).reset_index()
        df2 = df2.add_suffix('_AvPrVal')
        data = pd.merge(data,
                        df2,
                        left_on=['ymd'],
                        right_on=['ymd_AvPrVal'],
                        how='left')
        #data["mean_AvPrVal"].fillna(0, inplace=True)
        data.drop(["ymd_AvPrVal"], axis=1, inplace=True)
        for i in range(1, lagVal + 1):
            data["mean_AvPrVal{}".format(i)] = data.mean_AvPrVal.shift(i)
            data["mean_AvPrVal{}".format(i)].fillna(0, inplace=True)
            data["max_AvPrVal{}".format(i)] = data.max_AvPrVal.shift(i)
            data["max_AvPrVal{}".format(i)].fillna(0, inplace=True)
            data["min_AvPrVal{}".format(i)] = data.min_AvPrVal.shift(i)
            data["min_AvPrVal{}".format(i)].fillna(0, inplace=True)
    if winWeather != 0:
        # Daily mean T and R for one hard-coded weather id, up to `dateout`.
        dt_weather = pd.read_sql(
            'SELECT UTC as ymd, T, R FROM price.weather WHERE id = "{}" and UTC <= "{}"'
            .format("/weather.php?id=30710", dateout),
            con=connection)
        df = dt_weather.set_index('ymd').resample('D', label='right').agg({
            'T': 'mean',
            'R': 'mean'
        }).reset_index()
        date_start_pred = dateout + relativedelta(months=+1)
        date_stop_pred = dateout + relativedelta(months=+(nforecast + 1))
        #df['ymd'] = pd.to_datetime(df["ymd"])
        df['dayofyear'] = df.ymd.dt.dayofyear
        meanT = df.groupby('dayofyear')['T'].mean()
        meanR = df.groupby('dayofyear')['R'].mean()
        # Append one synthesised row per day of the forecast period; the
        # values come from `newT` (defined outside this function).
        for d in date_range(date_start_pred, date_stop_pred,
                            datetime.timedelta(days=1)):
            df = pd.concat([
                df,
                pd.DataFrame.from_dict({
                    'ymd': [datetime.datetime(d.year, d.month, d.day)],
                    'T': [newT(d, meanT, twinter, tsummer, tsping, tautomn)],
                    'R': [newT(d, meanR, rwinter, rsummer, rsping, rautomn)],
                    'dayofyear': [1]
                })
            ],
                           ignore_index=True)
        #df['ymd'] = df.index
        # Cumulative T and R within each `indexmonth` group; groups with an
        # even index are zeroed.
        df['indexmonth'] = df.apply(f_index_month, axis=1)
        df['indexmonth'] = df['indexmonth'].cumsum()
        df['cumT'] = df.groupby('indexmonth')['T'].cumsum()
        df['cumR'] = df.groupby('indexmonth')['R'].cumsum()
        df.loc[df['indexmonth'] % 2 == 0, 'cumT'] = 0
        df.loc[df['indexmonth'] % 2 == 0, 'cumR'] = 0
        #df = df.set_index('ymd')
        df.drop(
            'dayofyear',
            inplace=True,
            axis=1,
        )
        df.drop(
            'T',
            inplace=True,
            axis=1,
        )
        df.drop(
            'R',
            inplace=True,
            axis=1,
        )
        #df.drop('ymd2', inplace = True, axis=1,)
        df.drop(
            'indexmonth',
            inplace=True,
            axis=1,
        )
        # Shifts of 1, 8, 15, ... rows of the cumulative columns.
        for i in range(1, int(7 * lag), 7):
            df["cumT{}".format(i)] = df.cumT.shift(i)
            df["cumR{}".format(i)] = df.cumR.shift(i)
        data = pd.merge(data,
                        df,
                        left_on='ymd',
                        right_on='ymd',
                        how='left',
                        suffixes=('data', 'dt_weather'))
        #data = pd.merge(data, df, on=['ymd'], how='left', suffixes=('data', 'df2'))
    if lagVal == 0:
        data.drop(["ValueVal"], axis=1, inplace=True)
    # Drop the helper columns that are not model features.
    data.drop(["price"], axis=1, inplace=True)
    data.drop(["price_boxcox"], axis=1, inplace=True)
    data.drop(["ymd"], axis=1, inplace=True)
    #data.drop(["month"], axis=1, inplace=True)
    data.drop(["dateCalendar"], axis=1, inplace=True)
    #data.drop(["Trend"], axis=1, inplace=True)
    #data = data.dropna()
    data = data.reset_index(drop=True)
    return data
def getTestData(region, product, datein, dateout, lag, lagVal, lagS, test_size,
                trend, AvPr, AvPrVal, y_past):
    """Build the feature frame for one forecast step and its true price.

    Reads the same price window as training through the module-level
    ``connection``, keeps the rows before the test part, appends the
    previously predicted prices ``y_past`` and a placeholder row with price
    0, then derives the same kind of features as the training frame using
    the supplied fitted ``trend`` and the module-level ``lmbda``.

    Parameters
    ----------
    region, product, datein, dateout
        Values formatted into the price query.
    lag, lagS
        Iterables of shifts applied to the detrended price (zeros in
        ``lagS`` are skipped).
    lagVal
        Iterable of shifts applied to ``ValueVal``.
    test_size
        Number of trailing rows that form the test part.
    trend
        Fitted object whose ``predict`` gives the trend for date ordinals.
    AvPr
        Rolling window used on the monthly detrended price.
    AvPrVal
        Rolling window used on the monthly ``ValueVal``; 0 disables it.
    y_past
        Sequence of prices already predicted for earlier steps.

    Returns
    -------
    tuple
        ``(data, ytrue)`` where ``ytrue`` is the price at position
        ``test_index + len(y_past)`` of the queried rows.
    """
    # NOTE(review): the query text is assembled with str.format -- confirm
    # the arguments never come from untrusted input.
    df_train = pd.read_sql(
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}" and ymd > "{}" and ymd < "{}"'
        .format(region, product, datein, dateout),
        con=connection)
    test_index = int(len(df_train) - test_size)
    ytrue = df_train.iloc[test_index + len(y_past), :].price
    df_train = df_train[:test_index]
    past_ymd = df_train.iloc[-1, :].ymd
    # NOTE(review): `DataFrame.append` was removed in pandas 2.0 -- confirm
    # the pinned pandas version.
    # NOTE(review): every appended past value receives the same date
    # (past_ymd + 1 month) and `indd` is unused -- confirm whether
    # months=+(1 + indd) was intended.
    for indd, past_value in enumerate(y_past):
        df_train = df_train.append(
            {
                'price': past_value,
                'ymd': past_ymd + relativedelta(months=+1)
            },
            ignore_index=True)
    past_ymd = past_ymd + relativedelta(months=+(1 + len(y_past)))
    # Placeholder row (price 0) for the step being forecast.
    df_train = df_train.append({
        'price': 0,
        'ymd': past_ymd
    },
                               ignore_index=True)
    df_valute = pd.read_sql(
        'SELECT ValueVal, dateCalendar FROM price.valuta WHERE CharCode = "{}" '
        .format("USD"),
        con=connection)
    data = pd.merge(df_train, df_valute, left_on='ymd', right_on='dateCalendar')
    data["price_boxcox"] = boxcox(data["price"], lmbda)
    data["Trend"] = trend.predict(data['ymd'].map(
        datetime.datetime.toordinal).values.reshape(-1, 1))
    data["PriceWithOutTrend"] = data["price_boxcox"] - data["Trend"]
    data.loc[
        data['price'] == 0,
        "PriceWithOutTrend"] = 0.1  # if the price is zero, put the mean (NOTE(review): the value written is the constant 0.1)
    for i in lag:
        data["PriceWithOutTrend{}".format(i)] = data.PriceWithOutTrend.shift(i)
    for i in lagS:
        if i != 0:
            data["PriceWithOutTrend{}".format(
                i)] = data.PriceWithOutTrend.shift(i)
    for i in lagVal:
        data["lagValute_{}".format(i)] = data.ValueVal.shift(i)
    # means, maximum, minimum over a quarter, half a year, a year
    data.ymd = pd.to_datetime(data["ymd"])
    data["month"] = data.ymd.dt.month
    # The last (placeholder) row is excluded from the per-month statistics.
    meanPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate(
        'mean')
    maxPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate('max')
    minPrice = data[:-1].groupby('month')['PriceWithOutTrend'].aggregate('min')
    data.loc[:, 'meanPrice'] = [meanPrice[month] for month in data['month']]
    data.loc[:, 'maxPrice'] = [maxPrice[month] for month in data['month']]
    data.loc[:, 'minPrice'] = [minPrice[month] for month in data['month']]
    # Monthly resample, shifted by one period so the rolling statistics only
    # use earlier months.
    df = data.set_index('ymd').resample('MS', label='right').first()
    df1 = df['PriceWithOutTrend'].shift().rolling(
        min_periods=1, window=AvPr).agg(['mean', 'median']).reset_index()
    data = pd.merge(data, df1, on=['ymd'], how='left')
    if AvPrVal != 0:
        df2 = df['ValueVal'].shift().rolling(
            min_periods=1, window=AvPrVal).agg(['mean',
                                                'median']).reset_index()
        data = pd.merge(data, df2, on=['ymd'], how='left')
    # Drop the helper columns that are not model features.
    data.drop(["price"], axis=1, inplace=True)
    data.drop(["price_boxcox"], axis=1, inplace=True)
    data.drop(["ymd"], axis=1, inplace=True)
    data.drop(["month"], axis=1, inplace=True)
    data.drop(["dateCalendar"], axis=1, inplace=True)
    #data.drop(["Trend"], axis=1, inplace=True)
    data = data.dropna()
    data = data.reset_index(drop=True)
    return data, ytrue
def getPredictArima(region, sproduct, start=5):
    """Plot SARIMAX forecasts with and without a lagged exogenous series.

    Prices for ``region``/``sproduct`` and the USD rows of ``price.valuta``
    are read through the module-level ``connection``. The price series
    (from index ``start``) is Box-Cox transformed with the module-level
    ``lmbda``; the last ``nforecast`` points (also module-level) are held
    out. The model orders come from the first ``price.model`` row with
    criterion "aic". One plain SARIMAX forecast and ``LAG`` forecasts with
    ``ValueVal`` as exogenous input are plotted against the held-out prices,
    each labelled with its rounded MAPE.

    Parameters
    ----------
    region, sproduct
        Values formatted into the queries; ``sproduct`` is also the title.
    start
        Number of leading observations skipped.

    Returns
    -------
    None
    """
    # NOTE(review): the query text is assembled with str.format -- confirm
    # the arguments never come from untrusted input.
    df_train = pd.read_sql(
        'SELECT ymd, price FROM price.tab WHERE region = "{}" and products="{}"'
        .format(region, sproduct),
        con=connection)
    df_valute = pd.read_sql(
        'SELECT ValueVal, dateCalendar FROM price.valuta WHERE CharCode = "{}" '
        .format("USD"),
        con=connection)
    data = pd.merge(df_train, df_valute, left_on='ymd', right_on='dateCalendar')
    ex = data.ValueVal.values[start:]
    dta = df_train.price.values[start:]
    dttime = df_train.ymd.values[start:]
    xt = boxcox(dta, lmbda)
    # Hold out the last `nforecast` points.
    train = xt[:len(xt) - nforecast]
    df = pd.read_sql(
        'SELECT * FROM price.model WHERE region = "{}" and product="{}" and criterion = "aic"'
        .format(region, sproduct),
        con=connection)
    # Graph
    param = df.iloc[0]
    mod = sm.tsa.statespace.SARIMAX(
        train,
        order=(param.p, param.d, param.q),
        seasonal_order=(param.sp, param.sd, param.sq, param.ss),
    )
    res = mod.fit(disp=False)
    # Predict through the held-out horizon and map back to price scale.
    predict = res.get_prediction(end=mod.nobs + nforecast - 1)
    p_main = inv_boxcox(predict.predicted_mean, lmbda)
    plt.figure(figsize=(10, 8))
    plt.plot(
        dta[-nforecast:],
        'bs',
        label="fact",
    )
    plt.plot(p_main[-nforecast:],
             'r--',
             label="arima. mape {}".format(
                 round(
                     mean_absolute_percentage_error(dta[-nforecast:],
                                                    p_main[-nforecast:]))))
    LAG = 18
    # The first LAG points are dropped so the training slice and every
    # exogenous slice below have the same length.
    train = xt[LAG:len(xt) - nforecast]
    for lag in range(LAG):
        # Exogenous window starting `lag` positions in; it is plotted under
        # the label "lag LAG - lag".
        exog = ex[lag:len(xt) - nforecast + lag - LAG]
        mode = sm.tsa.statespace.SARIMAX(train,
                                         order=(param.p, param.d, param.q),
                                         seasonal_order=(param.sp, param.sd,
                                                         param.sq, param.ss),
                                         exog=exog)
        rese = mode.fit(disp=False)
        # NOTE(review): this slice is taken from `data` (not offset by
        # `start`) and ends at -lag - 1 -- confirm it lines up with the
        # forecast horizon.
        exog_forecast = data.iloc[-nforecast - lag - 1:-lag -
                                  1]['ValueVal'].values[..., np.newaxis]
        predicte = rese.get_prediction(end=mode.nobs + nforecast - 1,
                                       exog=exog_forecast)
        p_maine = inv_boxcox(predicte.predicted_mean, lmbda)
        plt.plot(p_maine[-nforecast:],
                 label="arimaX. lag {} mape {}".format(
                     LAG - lag,
                     round(
                         mean_absolute_percentage_error(
                             dta[-nforecast:], p_maine[-nforecast:]))))
    # mean_absolute_percentage_error(dta[-nforecast:], p_main[-nforecast:]), 2), round(mean_absolute_percentage_error(dta[-nforecast:], p_maine[-nforecast:]), 2)))
    plt.axis('tight')
    plt.grid(True)
    plt.legend()
    plt.title("{} ".format(sproduct))
    plt.show()
def regress(x, y, y_label):
    """Fit the module-level ``regr`` on ``(x, y)``, print its score and plot.

    Parameters
    ----------
    x
        Feature matrix handed to ``regr.fit`` / ``regr.predict``.
    y
        Target values; plotted on the horizontal axis.
    y_label
        Label of the horizontal axis.
    """
    regr.fit(x, y)
    # print(...) with one string argument behaves the same on Python 2 and is
    # valid on Python 3, where the bare print statement was a syntax error.
    print("R squared: " + str(regr.score(x, y)))

    # Plot outputs
    fig = plt.figure()
    plt.scatter(y, regr.predict(x), color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()


regress(x, latitude, 'latitude')
regress(x, longitude, 'longitude')


def boxcox(x, y, y_label):
    """Regress on the Box-Cox transformed target and plot back-transformed fits.

    ``y`` is shifted by ``abs(min(y)) + 1`` before ``stats.boxcox`` estimates
    lambda. ``regr`` is fitted on the transformed target and its predictions
    are mapped back with ``inv_boxcox`` and un-shifted.

    Parameters
    ----------
    x
        Feature matrix handed to ``regr.fit`` / ``regr.predict``.
    y
        Target values; plotted on the horizontal axis.
    y_label
        Label of the horizontal axis.
    """
    # Same shift in both directions; computed once instead of twice.
    offset = abs(min(y))
    box_cox, maxlog = stats.boxcox(y + offset + 1)
    regr.fit(x, box_cox)
    box_cox_predict = regr.predict(x)
    y_predict = inv_boxcox(box_cox_predict, maxlog) - offset - 1
    # The printed figure is the ratio of the variance of the predictions to
    # the variance of y.
    print("R squared: " + str(np.var(y_predict) / np.var(y)))

    # Plot outputs
    fig = plt.figure()
    plt.scatter(y, y_predict, color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()


boxcox(x, latitude, 'latitude')
boxcox(x, longitude, 'longitude')