def fit_transform(self, X, y=None):
    if len(X.shape) > 1:
        t = [stats.boxcox(X[:, col]) for col in range(X.shape[1])]
        xt, self.lambdas = zip(*t)
        return np.array(xt).T
    else:
        xt, self.lambdas = stats.boxcox(X)
        return np.array(xt)
def transform(self, X):
    X += self.shift
    if isinstance(self.lmbda, float):
        xb = boxcox(X, self.lmbda)
    else:
        xb = numpy.zeros(shape=X.shape)
        for j, lmb in enumerate(self.lmbda):
            xb[:, j] = boxcox(X[:, j], lmb)
    return (xb - self.xmean) / self.xstd
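A matching inverse for this shift/Box-Cox/standardize scaler is not shown in the excerpt; a minimal sketch, assuming scipy.special.inv_boxcox is acceptable and mirroring the attributes used above:

from scipy.special import inv_boxcox

def inverse_transform(self, Xt):
    # undo standardization, then invert the Box-Cox transform, then undo the shift
    xb = Xt * self.xstd + self.xmean
    if isinstance(self.lmbda, float):
        X = inv_boxcox(xb, self.lmbda)
    else:
        X = numpy.zeros(shape=Xt.shape)
        for j, lmb in enumerate(self.lmbda):
            X[:, j] = inv_boxcox(xb[:, j], lmb)
    return X - self.shift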
def better_loglikelihood(self, param_e):
    if self.result_producing_thing.typ == "emulator":
        self.result_producing_thing.emulate(param_e[0:-2])
    if self.result_producing_thing.typ == "swmm":
        self.result_producing_thing.run(param_e[0:-2])
    # Box-Cox transform (fixed lambda = 0.35) of measurement and model output,
    # shifted by 0.01 to keep the arguments strictly positive
    data = stats.boxcox((self.measurement > 0) * self.measurement + 0.01, 0.35)
    mean = stats.boxcox((self.result_producing_thing.result > 0) *
                        self.result_producing_thing.result + 0.01, 0.35)
    covariance = param_e[-1] * self.cov_mat_b_base + \
        self.cov_mat_e_base * param_e[-2]
    # Gaussian log-likelihood via slogdet and a linear solve
    lik = -0.5 * np.linalg.slogdet(covariance)[1] - \
        0.5 * np.dot(mean - data, np.linalg.solve(covariance, mean - data)) - \
        0.5 * self.t * np.log(2 * np.pi)
    return lik
def test_fixed_lmbda(self):
    np.random.seed(12345)
    x = stats.loggamma.rvs(5, size=50) + 5

    xt = stats.boxcox(x, lmbda=1)
    assert_allclose(xt, x - 1)
    xt = stats.boxcox(x, lmbda=-1)
    assert_allclose(xt, 1 - 1/x)

    xt = stats.boxcox(x, lmbda=0)
    assert_allclose(xt, np.log(x))

    # Also test that array_like input works
    xt = stats.boxcox(list(x), lmbda=0)
    assert_allclose(xt, np.log(x))
def executeOneSetting(tensor, density, roundId, para):
    logger.info('density=%.2f, %2d-round starts.' % (density, roundId + 1))
    (numUser, numService, numTime) = tensor.shape
    dim = para['dimension']

    # initialization
    U = np.random.rand(numUser, dim)
    S = np.random.rand(numService, dim)
    p = np.zeros(numUser)
    q = np.zeros(numService)

    # run for each time slice
    for sliceId in range(numTime):  # xrange is Python 2 only
        # Box-Cox data transformation
        matrix = tensor[:, :, sliceId]
        dataVector = matrix[:]
        (transfVector, alpha) = stats.boxcox(dataVector[dataVector > 0])
        maxV = np.max(transfVector)
        minV = np.min(transfVector)
        transfMatrix = matrix.copy()
        transfMatrix[transfMatrix != -1] = stats.boxcox(transfMatrix[transfMatrix != -1], alpha)
        transfMatrix[transfMatrix != -1] = (transfMatrix[transfMatrix != -1] - minV) / (maxV - minV)

        # remove data entries to generate trainMatrix and testMatrix
        seedID = roundId + sliceId * 100
        (trainMatrix, testMatrix) = evallib.removeEntries(matrix, density, seedID)
        trainMatrix = np.where(trainMatrix > 0, transfMatrix, 0)
        (testVecX, testVecY) = np.where(testMatrix)
        testVec = matrix[testVecX, testVecY]

        # invocation to the prediction function
        startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
        predictedMatrix = AMF.predict(trainMatrix, U, S, p, q, para)
        runningTime = float(time.perf_counter() - startTime)

        # evaluate the estimation error
        predVec = predictedMatrix[testVecX, testVecY]
        predVec = (maxV - minV) * predVec + minV
        predVec = evallib.argBoxcox(predVec, alpha)
        evalResult = evallib.errMetric(testVec, predVec, para['metrics'])
        result = (evalResult, runningTime)

        # dump the result at each density
        outFile = '%s%s_%s_result_%02d_%.2f_round%02d.tmp' % (
            para['outPath'], para['dataName'], para['dataType'],
            sliceId + 1, density, roundId + 1)
        evallib.dumpresult(outFile, result)
        logger.info('sliceId=%02d done.' % (sliceId + 1))

    logger.info('density=%.2f, %2d-round done.' % (density, roundId + 1))
    logger.info('----------------------------------------------')
def auto_arima(endog, freq=None, d=None, D=None, max_p=5, max_q=5, max_P=2,
               max_Q=2, max_order=5, max_d=2, max_D=1, start_p=2, start_q=2,
               start_P=1, start_Q=1, stationary=False, ic="aic", stepwise=True,
               trace=False, approximation=None, test="adf", seasonal_test="ch",
               allowdrift=True, allowmean=True, lambda_parameter=None,
               *args, **kwargs):
    # Parameter validity checks
    if np.any(np.isnan(endog)):
        raise ValueError("Missing values in series")
    origin_endog = endog
    if _is_using_pandas(endog, None):
        endog = np.asarray(endog)
    if len(endog) <= 10:
        raise ValueError("There are too few observations.")
    if np.any(np.isnan(endog)):
        raise ValueError("NaN values in endogenous not allowed")
    if np.all(endog == endog[0]):
        raise ValueError("The endogenous variable is a constant")
    if (not isinstance(freq, int)) or freq <= 1:
        raise ValueError("The frequency parameter must be an integer greater than 1")
    if lambda_parameter is not None:
        if lambda_parameter < 0:
            raise ValueError("The lambda parameter must be non-negative")
        if not np.all(endog > 0):
            raise ValueError("Box-Cox transformation can only be used on positive series.")
        endog = boxcox(endog, lambda_parameter)

    # Cap the search orders relative to the sample size
    max_p = min(max_p, floor(len(endog) / 3))
    max_q = min(max_q, floor(len(endog) / 3))
    max_P = min(max_P, floor(len(endog) / 3 / freq))
    max_Q = min(max_Q, floor(len(endog) / 3 / freq))
    if stationary:
        D = 0
        d = 0
    if freq == 1:
def test_alpha(self):
    np.random.seed(1234)
    x = stats.loggamma.rvs(5, size=50) + 5

    # Some regular values for alpha, on a small sample size
    _, _, interval = stats.boxcox(x, alpha=0.75)
    assert_allclose(interval, [4.004485780226041, 5.138756355035744])
    _, _, interval = stats.boxcox(x, alpha=0.05)
    assert_allclose(interval, [1.2138178554857557, 8.209033272375663])

    # Try some extreme values, see we don't hit the N=500 limit
    x = stats.loggamma.rvs(7, size=500) + 15
    _, _, interval = stats.boxcox(x, alpha=0.001)
    assert_allclose(interval, [0.3988867, 11.40553131])
    _, _, interval = stats.boxcox(x, alpha=0.999)
    assert_allclose(interval, [5.83316246, 5.83735292])
def test_mle(self):
    maxlog = stats.boxcox_normmax(self.x, method='mle')
    assert_allclose(maxlog, 1.758101, rtol=1e-6)

    # Check that boxcox() uses 'mle'
    _, maxlog_boxcox = stats.boxcox(self.x)
    assert_allclose(maxlog_boxcox, maxlog)
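boxcox_normmax can optimize objectives other than the MLE used in the test above; a small sketch of the two methods (the sample x here is made up, not the test fixture):

import numpy as np
from scipy import stats

x = stats.loggamma.rvs(5, size=50, random_state=0) + 5
lam_mle = stats.boxcox_normmax(x, method='mle')  # maximize the log-likelihood
lam_pearson = stats.boxcox_normmax(x)            # default method='pearsonr'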
def readIn_PredictionData(fn, dfmax, dfmin, transformationFunction):
    # header=False is not valid in pandas; None means "no header row"
    df = pd.read_csv(fn, sep=",", header=None)
    # countij = 0
    # for i in range(0, len(df.columns)):
    #     for j in range(i + 1, min(i + 5, len(df.columns))):
    #         countij = countij + 1
    #         df['new' + str(countij)] = np.multiply(df[df.columns[i]], df[df.columns[j]])
    print(len(df.columns))
    for i in range(0, len(df.columns)):
        # if df.columns[i] != "selection":
        if transformationFunction == "bin":
            df[df.columns[i]] = [(x if x < 31 else 50) for x in df[df.columns[i]]]
        elif transformationFunction == "binlog":
            df[df.columns[i]] = [(0.5 if x == 0 else (x if x < 31 else 50)) for x in df[df.columns[i]]]
            df[df.columns[i]] = np.log(df[df.columns[i]])
        elif transformationFunction == "log":
            df[df.columns[i]] = [(0.5 if x == 0 else x) for x in df[df.columns[i]]]
            df[df.columns[i]] = np.log(df[df.columns[i]])
        elif transformationFunction == "sqrt":
            df[df.columns[i]] = np.sqrt(df[df.columns[i]])
        elif transformationFunction == "boxcox":
            df[df.columns[i]] = stats.boxcox(np.array(df[df.columns[i]]))[0]
        df[df.columns[i]] = normalize_predictioninput(np.array(df[df.columns[i]]), dfmax[i], dfmin[i])
    return df
def sgs(data, xsteps=10, ysteps=10, nugget_dist=10, x_col='x_m', y_col='y_m',
        flux_col='flux', transform_data=True, invert_transform=True):
    x = data.x_m.values
    y = data.y_m.values
    flux = data.flux.values
    if transform_data:
        flux, L = scpstats.boxcox(flux)
        data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])
    new_x = []
    new_y = []
    new_flux = []

    # create array for the output
    idx, grid, indexGrid, M = makePathAndGrid(data, xsteps, ysteps)
    for step in idx:
        point = [grid[0][step], grid[1][step]]
        model = kriging.krig_model(data, nugget_dist, x_col, y_col, flux_col)
        est = kriging.krig_sample(model, point)
        indexPoint = [indexGrid[0][step], indexGrid[1][step]]
        M[indexPoint[0], indexPoint[1]] = est
        x = np.r_[x, point[0]]
        new_x.append(x[-1])
        y = np.r_[y, point[1]]
        new_y.append(y[-1])
        flux = np.r_[flux, est]
        new_flux.append(flux[-1])
        data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])

    if invert_transform and transform_data:
        M = invboxcox(M, L)
        new_flux = invboxcox(np.array(new_flux), L)

    return grid[0, :].reshape(M.shape), grid[1, :].reshape(M.shape), M, new_x, new_y, new_flux
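invboxcox is not defined in this excerpt; a common one-liner consistent with how it is called above (an assumption, not necessarily the original helper):

import numpy as np

def invboxcox(y, L):
    # inverse Box-Cox: exp(y) when lambda L is 0, else (L*y + 1)**(1/L)
    if L == 0:
        return np.exp(y)
    return np.exp(np.log(L * y + 1) / L)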
def transform(self, x):
    x = np.asarray(x)
    if self.method == 'lambert':
        return np.array([self.w_t(x_i, tp_i)
                         for x_i, tp_i in zip(x.T, self.trans_params)]).T
    elif self.method == 'boxcox':
        return np.array([boxcox(x_i, tp_i)
                         for x_i, tp_i in zip(x.T, self.trans_params)]).T
    else:
        raise NotImplementedError
def processing(data):
    # Build the new features
    create_feature(data)
    # Drop unwanted features
    data.drop(to_drop, axis=1, inplace=True)
    # Fill with 'None': per the feature description, None is a legitimate value
    # for these features, so their missing entries are filled with the string 'None'
    fill_none = ['MasVnrType', 'BsmtExposure', 'GarageType', 'MiscFeature']
    for col in fill_none:
        data[col].fillna('None', inplace=True)
    # Fill the remaining missing values: mode for categorical features,
    # median for numeric features
    na_col = data.dtypes[data.isnull().any()]
    for col in na_col.index:
        if na_col[col] != 'object':
            med = data[col].median()
            data[col].fillna(med, inplace=True)
        else:
            mode = data[col].mode()[0]
            data[col].fillna(mode, inplace=True)
    # Normalize skewed numeric features. numeric_col holds the numeric features,
    # zero_col the numeric features that contain zeros. Transforming features
    # that contain zeros causes all sorts of small problems, so only the
    # non-zero values of those features are transformed.
    numeric_col = data.skew().index
    zero_col = data.columns[data.isin([0]).any()]
    for col in numeric_col:
        # Skip "condition"-style features with few levels (values like 0, 1, 2,
        # 3, ...); transforming them is not very meaningful
        if len(pd.value_counts(data[col])) <= 10:
            continue
        # For features with zeros, transform only the non-zero values; boxcox
        # picks the form of the transformation from the data itself
        if col in zero_col:
            trans_data = data.loc[data[col] > 0, col]  # was data[data > 0][col], which leaves NaNs
            before = abs(trans_data.skew())
            cox, _ = boxcox(trans_data)
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data.loc[trans_data.index, col] = cox
        # Features without zeros are transformed in full
        else:
            before = abs(data[col].skew())
            cox, _ = boxcox(data[col])
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data.loc[:, col] = cox
    # Apply the value mappings in mapper
    for col, mapp in mapper.items():
        data.loc[:, col] = data[col].map(mapp)
def boxcoxtransform(dataframe, numeric_feats):
    lam = defaultdict(float)
    skewed_feats = dataframe[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in skewed_feats:
        dataframe[feats] = dataframe[feats] + 1
        dataframe[feats], lam[feats] = boxcox(dataframe[feats])
    return dataframe, lam
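A possible companion for applying the fitted lambdas to new data (a minimal sketch, not from the original source): scipy.stats.boxcox with an explicit lmbda skips estimation and just applies the transform, so the dict returned above can be reused as long as the same +1 shift is repeated.

def boxcoxapply(dataframe, lam):
    # apply previously fitted Box-Cox lambdas to a new dataframe
    for feats, lmbda in lam.items():
        dataframe[feats] = boxcox(dataframe[feats] + 1, lmbda=lmbda)
    return dataframe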
def test_lmbda_None(self):
    np.random.seed(1234567)
    # Start from normal rv's, do inverse transform to check that
    # optimization function gets close to the right answer.
    np.random.seed(1245)
    lmbda = 2.5
    x = stats.norm.rvs(loc=10, size=50000)
    x_inv = (x * lmbda + 1)**(-lmbda)
    xt, maxlog = stats.boxcox(x_inv)
    assert_almost_equal(maxlog, -1 / lmbda, decimal=2)
def fit(self, x):
    x = np.asarray(x)
    if self.method == 'lambert':
        for x_i in x.T:
            self.trans_params.append(self.iterate_moments(x_i, tol=self.tol, max_iter=self.max_iter))
    elif self.method == 'boxcox':
        for x_i in x.T:
            self.trans_params.append(boxcox(x_i)[1])
    else:
        raise NotImplementedError
def transform_data_to_gaussian_1D(feature_vector):
    """
    Takes arbitrarily distributed data and transforms it to an approximately
    Gaussian distribution using the Box-Cox transform.
    """
    new_x, l = stats.boxcox(feature_vector)
    return new_x
def boxcox(x, y, y_label):
    # shift y to be strictly positive before transforming
    box_cox, maxlog = stats.boxcox(y + abs(min(y)) + 1)
    regr.fit(x, box_cox)
    box_cox_predict = regr.predict(x)
    # invert the transform and undo the shift
    y_predict = inv_boxcox(box_cox_predict, maxlog) - abs(min(y)) - 1
    print("R squared: " + str(np.var(y_predict) / np.var(y)))

    # Plot outputs
    fig = plt.figure()
    plt.scatter(y, y_predict, color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()
def box_cox(df, lmbda=None, alpha=None):
    """
    Performs a Box-Cox transformation on all columns (features) of a pandas
    dataframe. Currently, there is some ambiguity as to how to deal with
    non-positive values and I need to check this out: at the moment, I just
    centre the data so that min(value) > 0 for all features, as necessitated
    by the very nature of the Box-Cox transformation.
    """
    # initialize empty dataframe with the same features as df
    df_tr = pd.DataFrame(columns=df.columns)
    for val in list(df.columns):
        # populate dataframe with transformed data; note the [0] indexing
        # assumes lmbda=None (boxcox then returns the array plus the fitted
        # lambda) - with an explicit lmbda, boxcox returns the array directly
        df_tr[val] = stats.boxcox(df[val] - min(df[val]) + 0.1, lmbda, alpha)[0]
    return df_tr
def fit(self, X):
    xtrans = numpy.zeros(shape=X.shape)
    if len(X.shape) == 2:
        self.shift = -X.min(axis=0)
        self.shift[self.shift < 0] = 0
        self.shift += 3 * X.std(axis=0)
        X += self.shift
        self.lmbda = numpy.zeros(X.shape[1])
        for j in range(X.shape[1]):
            _, self.lmbda[j] = boxcox(X[:, j])
            self.lmbda[j] = max(self.lmbda[j], self.minlmbda)
            self.lmbda[j] = min(self.lmbda[j], self.maxlmbda)
            if numpy.abs(self.lmbda[j]) < 1e-4:
                self.lmbda[j] = 0
                print("changing lambda")
            xtrans[:, j] = boxcox(X[:, j], self.lmbda[j])
    elif len(X.shape) == 1:
        self.shift = max([1e-10, -X.min()])
        self.shift += 3 * X.std()
        X += self.shift
        xtrans, self.lmbda = boxcox(X)
    self.xmean = xtrans.mean(axis=0)
    self.xstd = xtrans.std(axis=0)
def append_boxcox(data, cols, drop_old=False):
    """Apply boxcox transformations to a list of columns

    data: a pandas DataFrame
    cols: a list of column names for which to perform boxcox transformations
    """
    if isinstance(cols, str):  # basestring is Python 2 only
        cols = [cols]
    for col in cols:
        # boxcox also returns maxlog, the lambda param that is chosen;
        # could be used for pipelining objects
        data[col + '_boxcox'] = stats.boxcox(data[col])[0]
        if drop_old:
            data.drop(col, axis=1, inplace=True)
def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)

    # compute skew and do Box-Cox transformation (Tilli)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    print("\nSkew in numeric features:")
    print(skewed_feats)
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain
def transform_features(x_train, x_test):
    """
    Transform features using a Box-Cox transform. Remove vibrato features.
    Computes the optimal value of lambda on the training set and applies
    this lambda to the testing set.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Untransformed training features.
    x_test : np.array [n_samples, n_features]
        Untransformed testing features.

    Returns
    -------
    x_train_boxcox : np.array [n_samples, n_features_trans]
        Transformed training features.
    x_test_boxcox : np.array [n_samples, n_features_trans]
        Transformed testing features.
    """
    x_train = x_train[:, 0:6]
    x_test = x_test[:, 0:6]
    _, n_feats = x_train.shape

    x_train_boxcox = np.zeros(x_train.shape)
    lmbda_opt = np.zeros((n_feats,))
    eps = 1.0  # shift features away from zero
    for i in range(n_feats):
        x_train_boxcox[:, i], lmbda_opt[i] = boxcox(x_train[:, i] + eps)

    x_test_boxcox = np.zeros(x_test.shape)
    for i in range(n_feats):
        x_test_boxcox[:, i] = boxcox(x_test[:, i] + eps, lmbda=lmbda_opt[i])
    return x_train_boxcox, x_test_boxcox
def boxcox(X):
    """
    Gaussianize X using the Box-Cox transformation: [samples x phenotypes]

    - Each phenotype is brought to a positive scale by first subtracting
      the minimum value and adding 1.
    - Then each phenotype is transformed by the Box-Cox transformation.
    """
    X_transformed = sp.zeros_like(X)
    maxlog = sp.zeros(X.shape[1])
    for i in range(X.shape[1]):
        i_nan = sp.isnan(X[:, i])
        values = X[~i_nan, i]
        X_transformed[i_nan, i] = X[i_nan, i]
        X_transformed[~i_nan, i], maxlog[i] = st.boxcox(values - values.min() + 1.0)
    return X_transformed, maxlog
def preprocess_feature(cls, feature, parameters):
    is_not_empty = 1 - np.isclose(feature, MISSING_VALUE)
    if parameters.feature_type == identify_types.BINARY:
        # Binary features are always 1 unless they are 0
        return ((feature != 0) * is_not_empty).astype(np.float32)
    if parameters.boxcox_lambda is not None:
        feature = stats.boxcox(
            np.maximum(feature + parameters.boxcox_shift, BOX_COX_MARGIN),
            parameters.boxcox_lambda,
        )
        # No *= to ensure consistent out-of-place operation.
    if parameters.feature_type == identify_types.PROBABILITY:
        feature = np.clip(feature, 0.01, 0.99)
        feature = special.logit(feature)
    elif parameters.feature_type == identify_types.QUANTILE:
        transformed_feature = np.zeros_like(feature)
        for i in six.moves.range(feature.shape[0]):
            transformed_feature[i] = cls.value_to_quantile(
                feature[i], parameters.quantiles
            )
        feature = transformed_feature
    elif parameters.feature_type == identify_types.ENUM:
        possible_values = parameters.possible_values
        mapping = {}
        for i, possible_value in enumerate(possible_values):
            mapping[possible_value] = i
        output_feature = np.zeros((len(feature), len(possible_values)))
        for i, val in enumerate(feature):
            if abs(val - MISSING_VALUE) < 1e-2:
                # This check is required by the PT preprocessing but not C2
                continue
            output_feature[i][mapping[val]] = 1.0
        return output_feature
    elif parameters.feature_type == identify_types.CONTINUOUS_ACTION:
        min_value = parameters.min_value
        max_value = parameters.max_value
        feature = (
            (feature - min_value) * ((1 - 1e-6) * 2 / (max_value - min_value))
            - 1 + 1e-6
        )
    else:
        feature = feature - parameters.mean
        feature /= parameters.stddev
        feature = np.clip(feature, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)
    feature *= is_not_empty
    return feature
def transform(self, x):
    x = np.asarray(x)
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    elif len(x.shape) != 2:
        print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")
    if x.shape[1] != len(self.taus):
        print("%d variables in test data, but %d variables were in training data."
              % (x.shape[1], len(self.taus)))

    if self.strategy == 'lambert':
        return np.array([w_t(x_i, tau_i) for x_i, tau_i in zip(x.T, self.taus)]).T
    elif self.strategy == 'brute':
        return np.array([norm.ppf((rankdata(x_i) - 0.5) / len(x_i)) for x_i in x.T]).T
    elif self.strategy == 'boxcox':
        return np.array([boxcox(x_i, lmbda=lmbda_i) for x_i, lmbda_i in zip(x.T, self.taus)]).T
    else:
        raise NotImplementedError
def fit(self, x):
    x = np.asarray(x)
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    elif len(x.shape) != 2:
        print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

    if self.strategy == 'lambert':
        for x_i in x.T:
            self.taus.append(igmm(x_i, tol=self.tol, max_iter=self.max_iter))
    elif self.strategy == 'brute':
        for x_i in x.T:
            self.taus.append(None)  # TODO: In principle, we could store parameters to do a quasi-invert
    elif self.strategy == 'boxcox':
        for x_i in x.T:
            self.taus.append(boxcox(x_i)[1])
    else:
        raise NotImplementedError
def Preprocess_TransformNumericFeatures(self, dfall, trans_type='boxcox', correction=0.00001):
    if self.num_features is None:
        raise TypeError("Execute the SetUpTrainTest method to use this feature")
    if trans_type not in ['boxcox']:
        raise TypeError("Transformation type not supported")

    self.lmbdaDict = {}
    for c in self.num_features:
        print('Applying', trans_type, 'transformation on:', c)
        if trans_type == 'boxcox':
            # the small correction keeps the data strictly positive
            b = stats.boxcox(dfall[c] + correction)
            dfall[c] = b[0]
            self.lmbdaDict[c] = b[1]
    return dfall
def _estimate_lambda_single_y(y):
    """Estimate lambda for a single y, given a range of lambdas
    through which to search. No validation performed.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
        The vector being estimated against
    """
    # ensure is array
    y = np.array(y)

    # Use scipy's log-likelihood estimator
    b = boxcox(y, lmbda=None)

    # Return lambda corresponding to maximum P
    return b[1]
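A quick usage sketch for the helper above (the skewed sample is made up):

import numpy as np

lam = _estimate_lambda_single_y(np.random.RandomState(0).lognormal(size=200))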
def fit(self, x, y=None):
    """Fit a Gaussianizing transformation to each variable/column in x."""
    x = np.asarray(x)
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    elif len(x.shape) != 2:
        print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

    if self.strategy == 'lambert':
        if self.verbose:
            print("Gaussianizing with Lambert method")
        for x_i in x.T:
            self.coefs_.append(igmm(x_i, tol=self.tol, max_iter=self.max_iter))
    elif self.strategy == 'brute':
        for x_i in x.T:
            self.coefs_.append(None)  # TODO: In principle, we could store parameters to do a quasi-invert
    elif self.strategy == 'boxcox':
        for x_i in x.T:
            self.coefs_.append(boxcox(x_i)[1])
    else:
        raise NotImplementedError
    return self
def boxcox_xform(X, scaling=True):
    """
    Robust version of the Box-Cox transform. Handles negative data and very
    large values in the original data.

    :param X: data (numeric list, Pandas series or 1d np array)
    :param scaling: whether to normalize between 0 and 1 or not
    :return: Box-Cox transformed array, the abs(max value of the original
             data set), and the optimal lbda parameter
    """
    x_arr = np.array(list(X))
    x_max = np.max(np.abs(x_arr)) if scaling is True else 1.0
    if len(np.unique(x_arr)) > 0:
        if np.min(x_arr) <= 0.0:  # shift and rescale
            print('use YJ transform: yj_xform(X)')
            return None
        else:  # only positive values
            z = x_arr / x_max  # scale to deal with overflow/underflow: values in (0, 1]
            y, lbda = sps.boxcox(z, lmbda=None, alpha=None)
            # lbda = _boxcox_opt(z)
            # y = _boxcox_xform(lbda, x_arr)
            return y, x_max, lbda
    else:
        print('boxcox_xform: no data')
        return None, None, None  # match the arity of the successful return
popular_artist = df.groupby('artist_name').size()
print(popular_artist)
artist_list = df['artist_name'].values.tolist()

df.isnull().sum()
df.fillna(0)
pd.set_option('precision', 3)
df.describe()

# Finding out the skew for each attribute
skew = df.skew()
print(skew)

# Removing the skew by using the boxcox transformations
transform = np.asarray(df['Liveness'].values)  # boxcox needs a 1-D array, not a (n, 1) frame
df_transform = stats.boxcox(transform)[0]
# Plotting a histogram to show the difference
plt.hist(df['Liveness'], bins=10)  # original data
plt.show()
plt.hist(df_transform, bins=10)  # corrected skew data
plt.show()

transform1 = np.asarray(df['Popularity'].values)
df_transform1 = stats.boxcox(transform1)[0]
# Plotting a histogram to show the difference
# plt.hist(df['Popularity'], bins=10)  # original data
# plt.show()
# plt.hist(df_transform1, bins=10)  # corrected skew data
# plt.show()

sns.distplot(df['Popularity'], bins=10, kde=True,
             kde_kws={"color": "k", "lw": 2, "label": "KDE"}, color='yellow')
plt.show()
def fit(self, smoothing_level=None, smoothing_slope=None, smoothing_seasonal=None,
        damping_slope=None, optimized=True, use_boxcox=False, remove_bias=False,
        use_basinhopping=False, start_params=None, initial_level=None,
        initial_slope=None, use_brute=True):
    """
    Fit the model

    Parameters
    ----------
    smoothing_level : float, optional
        The alpha value of the simple exponential smoothing, if the value
        is set then this value will be used as the value.
    smoothing_slope : float, optional
        The beta value of the Holt's trend method, if the value is set then
        this value will be used as the value.
    smoothing_seasonal : float, optional
        The gamma value of the holt winters seasonal method, if the value
        is set then this value will be used as the value.
    damping_slope : float, optional
        The phi value of the damped method, if the value is set then this
        value will be used as the value.
    optimized : bool, optional
        Estimate model parameters by maximizing the log-likelihood
    use_boxcox : {True, False, 'log', float}, optional
        Should the Box-Cox transform be applied to the data first? If 'log'
        then apply the log. If float then use lambda equal to float.
    remove_bias : bool, optional
        Remove bias from forecast values and fitted values by enforcing
        that the average residual is equal to zero.
    use_basinhopping : bool, optional
        Using Basin Hopping optimizer to find optimal values
    start_params : array, optional
        Starting values to used when optimizing the fit. If not provided,
        starting values are determined using a combination of grid search
        and reasonable values based on the initial values of the data
    initial_level : float, optional
        Value to use when initializing the fitted level.
    initial_slope : float, optional
        Value to use when initializing the fitted slope.
    use_brute : bool, optional
        Search for good starting values using a brute force (grid)
        optimizer. If False, a naive set of starting values is used.

    Returns
    -------
    results : HoltWintersResults class
        See statsmodels.tsa.holtwinters.HoltWintersResults

    Notes
    -----
    This is a full implementation of the holt winters exponential smoothing
    as per [1]. This includes all the unstable methods as well as the
    stable methods. The implementation of the library covers the
    functionality of the R library as much as possible whilst still being
    Pythonic.

    References
    ----------
    [1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles
        and practice. OTexts, 2014.
    """
    # Variable renames to alpha, beta, etc as this helps with following the
    # mathematical notation in general
    alpha = smoothing_level
    beta = smoothing_slope
    gamma = smoothing_seasonal
    phi = damping_slope
    l0 = self._l0 = initial_level
    b0 = self._b0 = initial_slope

    data = self.endog
    damped = self.damped
    seasoning = self.seasoning
    trending = self.trending
    trend = self.trend
    seasonal = self.seasonal
    m = self.seasonal_periods
    opt = None
    phi = phi if damped else 1.0
    if use_boxcox == 'log':
        lamda = 0.0
        y = boxcox(data, lamda)
    elif isinstance(use_boxcox, float):
        lamda = use_boxcox
        y = boxcox(data, lamda)
    elif use_boxcox:
        y, lamda = boxcox(data)
    else:
        lamda = None
        y = data.squeeze()
    if np.ndim(y) != 1:
        raise ValueError('Only 1 dimensional data supported')
    self._y = y
    lvls = np.zeros(self.nobs)
    b = np.zeros(self.nobs)
    s = np.zeros(self.nobs + m - 1)
    p = np.zeros(6 + m)
    max_seen = np.finfo(np.double).max
    l0, b0, s0 = self.initial_values()

    xi = np.zeros_like(p, dtype=np.bool)
    if optimized:
        init_alpha = alpha if alpha is not None else 0.5 / max(m, 1)
        init_beta = beta if beta is not None else 0.1 * init_alpha if trending else beta
        init_gamma = None
        init_phi = phi if phi is not None else 0.99
        # Selection of functions to optimize for appropriate parameters
        if seasoning:
            init_gamma = gamma if gamma is not None else 0.05 * (1 - init_alpha)
            xi = np.array([alpha is None, trending and beta is None, gamma is None,
                           initial_level is None, trending and initial_slope is None,
                           phi is None and damped] + [True] * m)
            func = SMOOTHERS[(seasonal, trend)]
        elif trending:
            xi = np.array([alpha is None, beta is None, False,
                           initial_level is None, initial_slope is None,
                           phi is None and damped] + [False] * m)
            func = SMOOTHERS[(None, trend)]
        else:
            xi = np.array([alpha is None, False, False,
                           initial_level is None, False, False] + [False] * m)
            func = SMOOTHERS[(None, None)]
        p[:] = [init_alpha, init_beta, init_gamma, l0, b0, init_phi] + s0

        if np.any(xi):
            # txi [alpha, beta, gamma, l0, b0, phi, s0,..,s_(m-1)]
            # Have a quick look in the region for a good starting place for
            # alpha etc. using guesstimates for the levels
            txi = xi & np.array([True, True, True, False, False, True] + [False] * m)
            txi = txi.astype(np.bool)
            bounds = np.array([(0.0, 1.0), (0.0, 1.0), (0.0, 1.0),
                               (0.0, None), (0.0, None), (0.0, 1.0)] +
                              [(None, None), ] * m)
            args = (txi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs, max_seen)
            if start_params is None and np.any(txi) and use_brute:
                res = brute(func, bounds[txi], args, Ns=20,
                            full_output=True, finish=None)
                p[txi], max_seen, _, _ = res
            else:
                if start_params is not None:
                    start_params = np.atleast_1d(np.squeeze(start_params))
                    if len(start_params) != xi.sum():
                        raise ValueError('start_params must have {0} values but '
                                         'has {1} instead'.format(len(xi),
                                                                  len(start_params)))
                    p[xi] = start_params
                args = (xi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs, max_seen)
                max_seen = func(np.ascontiguousarray(p[xi]), *args)
            # alpha, beta, gamma, l0, b0, phi = p[:6]
            # s0 = p[6:]
            # bounds = np.array([(0.0,1.0),(0.0,1.0),(0.0,1.0),(0.0,None),
            #                    (0.0,None),(0.8,1.0)] + [(None,None),]*m)
            args = (xi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs, max_seen)
            if use_basinhopping:
                # Take a deeper look in the local minimum we are in to find the best
                # solution to parameters, maybe hop around to try escape the local
                # minimum we may be in.
                res = basinhopping(func, p[xi],
                                   minimizer_kwargs={'args': args, 'bounds': bounds[xi]},
                                   stepsize=0.01)
                success = res.lowest_optimization_result.success
            else:
                # Take a deeper look in the local minimum we are in to find the
                # best solution to parameters
                res = minimize(func, p[xi], args=args, bounds=bounds[xi])
                success = res.success
            if not success:
                from warnings import warn
                from statsmodels.tools.sm_exceptions import ConvergenceWarning
                warn("Optimization failed to converge. Check mle_retvals.",
                     ConvergenceWarning)
            p[xi] = res.x
            opt = res
        else:
            from warnings import warn
            from statsmodels.tools.sm_exceptions import EstimationWarning
            message = "Model has no free parameters to estimate. Set " \
                      "optimized=False to suppress this warning"
            warn(message, EstimationWarning)

    [alpha, beta, gamma, l0, b0, phi] = p[:6]
    s0 = p[6:]
    hwfit = self._predict(h=0, smoothing_level=alpha, smoothing_slope=beta,
                          smoothing_seasonal=gamma, damping_slope=phi,
                          initial_level=l0, initial_slope=b0, initial_seasons=s0,
                          use_boxcox=use_boxcox, remove_bias=remove_bias,
                          is_optimized=xi)
    hwfit._results.mle_retvals = opt
    return hwfit
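The use_boxcox branches above map onto plain scipy calls; a minimal standalone sketch of the same dispatch (the series data here is made up):

import numpy as np
from scipy.stats import boxcox

data = np.arange(1.0, 25.0)      # hypothetical strictly positive series
y_log = boxcox(data, 0.0)        # use_boxcox='log': lambda fixed at 0 (log transform)
y_fix = boxcox(data, 0.5)        # use_boxcox=<float>: lambda fixed at the given value
y_mle, lamda = boxcox(data)      # use_boxcox=True: lambda estimated by MLE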
wine = pd.read_csv(join(path_to_data_folder, 'monthly-australian-wine-sales.csv'),
                   ',', index_col=['month'], parse_dates=['month'], dayfirst=True)
wine.sales = wine.sales * 1000
wine.sales.plot(title="Diki: " + str(diki(wine.sales)))
print('Diki wine: ', diki(wine.sales))
# sm.tsa.seasonal_decompose(wine.sales).plot()
# plt.show()

wine['sales_box'], lmbda = stats.boxcox(wine.sales)
wine.sales_box.plot(title="Diki: " + str(diki(wine.sales_box)))
# plt.show()
plt.ylabel(u'Transformed wine sales')
print("Optimal Box-Cox transformation parameter: %f" % lmbda)
print("Dickey-Fuller test: p=%f" % diki(wine.sales_box))

wine['sales_box_diff12'] = wine.sales_box - wine.sales_box.shift(12)
wine.sales_box_diff12.dropna(inplace=True)
wine.sales_box_diff12.plot(title="sales_box_diff12, diki: " + str(diki(wine.sales_box_diff12)))
sm.tsa.seasonal_decompose(wine.sales_box_diff12).plot()
# plt.show()
wine['sales_box_diff1'] = wine.sales_box_diff12 - wine.sales_box_diff12.shift(1)
data_1 = data_1[data_1.loan_status != 'Late (31-120 days)']
data_1 = data_1[data_1.loan_status != 'Issued']
data_1['loan_status'] = data_1['loan_status'].replace({'Charged Off': 'Default'})
data_1['loan_status'] = data_1['loan_status'].replace({'In Grace Period': 'Default'})
data_1.loan_status = data_1.loan_status.astype('category').cat.codes
data_1.delinq_2yrs = data_1.delinq_2yrs.astype('category').cat.codes
# print(data_1['loan_status'].unique())
# print(data_1['loan_status'].value_counts())

numerical = data_1.columns[data_1.dtypes == 'float64']
for i in numerical:
    if data_1[i].min() > 0:
        transformed, lamb = boxcox(data_1.loc[data_1[i].notnull(), i])
        # only keep the transform when lambda is meaningfully far from 1 (identity)
        if np.abs(1 - lamb) > 0.02:
            data_1.loc[data_1[i].notnull(), i] = transformed

### Splitting the data in 2 for training and testing
data_1 = pd.get_dummies(data_1, drop_first=True)
# ros = RandomOverSampler(random_state=0)
traindata, testdata = train_test_split(data_1, stratify=data_1['loan_status'], test_size=.2)
testdata.reset_index(drop=True, inplace=True)
traindata.reset_index(drop=True, inplace=True)
sc = StandardScaler()
                                 v in values_for_variable if v is not None])
n_outliers_removed = 0
# if remove_large_outliers:
#     percentile_99 = np.percentile(nn_values_for_variable, 97)
#     outlier_indices = nn_values_for_variable > percentile_99
#     n_outliers_removed = np.count_nonzero(outlier_indices)
#     nn_values_for_variable = nn_values_for_variable[outlier_indices == False]
nn_values_for_variable = np.asarray(nn_values_for_variable)
alpha = np.min(nn_values_for_variable)
nn_values_for_variable -= alpha
lmbda = boxcox_normmax(nn_values_for_variable + BOXCOX_A, method='mle')
nn_values_for_variable = boxcox(nn_values_for_variable + BOXCOX_A, lmbda=lmbda)
ind = (all_values_for_variable != None)
all_values_for_variable[ind] = nn_values_for_variable
# beta = np.std(nn_values_for_variable)
# nn_values_for_variable /= beta
# delta = np.mean(nn_values_for_variable)
# nn_values_for_variable -= delta
all_values_for_variables.append(all_values_for_variable)
positive_values_for_variables.append(nn_values_for_variable)
scaling_parameters.append([lmbda, alpha, 1, 0])
n_unique_values = len(np.unique(nn_values_for_variable))
n_not_missing = len(nn_values_for_variable)
n_total_samples = len(values_for_variable)
# plot_line(df, "date", "daily_sign_ups")

"""
Box-Cox transforms are data transformations that evaluate a set of lambda
coefficients (λ) and select the value that achieves the best approximation
of normality.

The boxcox method returns a positive dataset transformed by a Box-Cox power
transformation. It has one required input: a 1-dimensional array of positive
data to transform.

You can also specify the λ value you'd like to use for your transformation
(e.g. λ = 0 for a log transform); otherwise, the boxcox method will find the
λ that maximizes the log-likelihood function and will return it as the
second output argument.
"""

# Apply Box-Cox Transform to value column and assign to new column y
df["y"], lam = boxcox(df.daily_sign_ups)

# plot daily signups and boxcox transformation
fig = plt.figure(dpi=300, figsize=(6, 4))
ax1 = plt.subplot(311)
plt.plot(df.date, df.daily_sign_ups, "b")
plt.setp(ax1.get_xticklabels(), fontsize=6)

ax2 = plt.subplot(312, sharex=ax1)  # share x only
plt.plot(df.date, df.y, "g")
plt.setp(ax2.get_xticklabels(), visible=False)  # make these tick labels invisible

plt.show()

# instantiating (create an instance of) a Prophet object
m = Prophet()
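A small illustration of the two calling conventions described in the comment above, plus the round trip via scipy.special.inv_boxcox (a hedged sketch; the array x is made up):

import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox

x = np.random.exponential(size=100) + 1e-6  # made-up strictly positive data
y_log = boxcox(x, lmbda=0)    # fixed lambda: returns only the transformed array
y, lam = boxcox(x)            # lambda omitted: the MLE lambda is the second output
x_back = inv_boxcox(y, lam)   # exact analytic inverse
assert np.allclose(x, x_back)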
                  squeeze=True)
y = validation.values.astype('float32')

# load model
model_fit = ARIMAResults.load('model.pkl')
lam = numpy.load('model_lambda.npy')

# make first prediction
predictions = list()
yhat = model_fit.forecast()[0]
yhat = boxcox_inverse(yhat, lam)
predictions.append(yhat)
history.append(y[0])
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))

# rolling forecasts
for i in range(1, len(y)):
    # transform
    transformed, lam = boxcox(history)
    if lam < -5:
        transformed, lam = history, 1
    # predict
    model = ARIMA(transformed, order=(0, 1, 2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    # invert transformed prediction
    yhat = boxcox_inverse(yhat, lam)
    predictions.append(yhat)
    # observation
    obs = y[i]
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))

# report performance
rmse = sqrt(mean_squared_error(y, predictions))
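boxcox_inverse is not defined in this excerpt; a minimal definition consistent with how it is called above (an assumption, not necessarily the original helper):

from math import exp, log

def boxcox_inverse(value, lam):
    # invert the Box-Cox transform; lam == 0 corresponds to the log transform
    if lam == 0:
        return exp(value)
    return exp(log(lam * value + 1) / lam)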
# Box-Cox transform
import pandas
from scipy.stats import boxcox

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# shift by 1 so the column is strictly positive before transforming
X_boxcox = boxcox(1 + X[:, 2])[0]
print(X_boxcox)
    train_test[i] = np.log10(train_test[i] + 10)
'''

##
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_feats = train_test.select_dtypes(include=numerics).columns
skewed_feats = train_test[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.2]
skewed_feats = skewed_feats.index
for feats in skewed_feats:
    train_test[feats] = train_test[feats] + 1
    train_test[feats], lam = boxcox(train_test[feats])

##
train_test = train_test.drop(['pymnt_plan', 'verification_status_joint',
                              'application_type', 'title', 'batch_enrolled'], axis=1)

print('Label Encoding')
cat = ['object']
train_test.select_dtypes(include=cat)
cat_col = train_test.select_dtypes(include=cat).columns
'''
rm = ['emp_length', 'zip_code', 'last_week_pay']
cat_col = [c for c in cat_col if c not in rm]
cat_col
PS. λ is the transformation parameter to be determined; typical values are
λ = 0, 1/2, -1. scipy.stats defaults λ to None.
'''
with tqdm(total=train_num_encode.shape[1], desc='Transforming', unit='cols') as pbar:
    # iterate over every numeric feature
    for col in range(train_num_encode.shape[1]):
        # gather all data for this feature (training and test sets combined)
        values = np.hstack((train_num[:, col], test_num[:, col]))
        # measure how skewed the combined data is
        sk = skew(values)
        if sk > 0.25:
            # apply the Box-Cox transform
            values_enc, lam = boxcox(values + 1)
            train_num_encode[:, col] = values_enc[:train_num.shape[0]]
            test_num_encode[:, col] = values_enc[train_num.shape[0]:]
        else:
            # leave the feature unchanged
            train_num_encode[:, col] = train_num[:, col]
            test_num_encode[:, col] = test_num[:, col]
        pbar.update(1)

print('saving...')
# save the features
Dataset.save_part_feature('numeric_boxcox', Dataset.get_part_feature('numeric'))
# save the data
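The special values of λ mentioned above can be checked directly against the Box-Cox definition (x**λ - 1) / λ (a small sketch; x is made up):

import numpy as np
from scipy.stats import boxcox

x = np.linspace(0.5, 5.0, 20)
assert np.allclose(boxcox(x, 0.0), np.log(x))               # λ = 0: log transform
assert np.allclose(boxcox(x, 0.5), (np.sqrt(x) - 1) / 0.5)  # λ = 1/2: square-root form
assert np.allclose(boxcox(x, -1.0), (1 / x - 1) / -1.0)     # λ = -1: reciprocal form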
def plot_df_transformed(df: pd.DataFrame, filters: tuple, zero_values: str = "ignore") -> None:
    """
    Plots three graphs for each variable satisfying the filters: original
    distribution, transformed distribution and transformed probability plot.

    :param df: supplied dataframe
    :param zero_values: either "ignore" or "increment"
    :param filters: tuple containing filter strings for columns
    :return:
    """
    to_plot = filtered_col_list(df, strings=filters)
    height = ceil(len(to_plot) * 10 / 3)
    fig = plt.figure(figsize=(12, height))
    count = 0
    for var in to_plot:
        try:
            if zero_values == "ignore":
                series = pd.Series(df[np.abs(df[var]) > 0][var])
            elif zero_values == "increment":
                series = pd.Series(df[var] + 0.0001)

            ax1 = plt.subplot2grid((len(to_plot), 3), (count, 0), colspan=1, rowspan=1)
            ax1.get_xaxis().set_major_formatter(
                FuncFormatter(lambda x, p: format(int(x), ',')))
            plt.yticks([], [])
            sns.distplot(series, fit=norm)
            plt.title("Distribution: Original Data")
            plt.ylabel("Density")

            series_transformed = boxcox(np.abs(np.asarray(series.values)))[0]
            ax2 = plt.subplot2grid((len(to_plot), 3), (count, 1), rowspan=1, colspan=1)
            ax2.get_xaxis().set_major_formatter(
                FuncFormatter(lambda x, p: format(int(x), ',')))
            sns.distplot(series_transformed, fit=norm)
            plt.xlabel(var)
            plt.title("Distribution: Boxcox Data")
            plt.yticks([], [])
            plt.ylabel("Density")

            ax3 = plt.subplot2grid((len(to_plot), 3), (count, 2), colspan=1, rowspan=1)
            ax3.get_yaxis().set_major_formatter(
                FuncFormatter(lambda x, p: format(int(x), ',')))
            stats.probplot(series_transformed, plot=ax3)
            plt.title("Probability Plot: Boxcox Data")
            plt.xlabel(var)
            plt.yticks([], [])
        except Exception as e:
            print(var, e)
        count += 1
    plt.subplots_adjust(hspace=0.2)
    plt.tight_layout()
    plt.show()
def boxcoxtransform(*column_names, add=0):
    for colname in column_names:
        # fit lambda on the combined frame, then apply it to each split
        bc_xform_values, bc_lambda = stats.boxcox(super_df[colname] + add)
        print('BoxCox Transform ', colname, ' with lambda: ', bc_lambda)
        for df in combine_df:
            df[colname] = stats.boxcox(df[colname] + add, bc_lambda)
def feature_engineering(self):
    # combine train and test data into one dataframe
    df_all = pd.concat([self.df_csv_train, self.df_csv_test])
    print('train.shape=', self.df_csv_train.shape, ', test.shape=', self.df_csv_test.shape)
    cols_with_na = ProcessData.get_cols_with_na(df_all.drop('SalePrice', axis=1))
    print(cols_with_na.sort_values(ascending=False).to_string())

    # 1.Meaningful NA Values #########
    # columns where NA values have meaning, e.g. no pool, no basement, etc.
    cols_fillna = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType',
                   'FireplaceQu', 'GarageQual', 'GarageCond', 'GarageFinish',
                   'GarageType', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
                   'BsmtFinType1', 'BsmtFinType2']
    # replace 'NA' with 'None' in these columns
    for col in cols_fillna:
        df_all[col].fillna('None', inplace=True)

    # GarageYrBlt nans: no garage. Fill with property YearBuilt.
    # (more appropriate than 0, which would be ~2000 away from all other values)
    df_all.loc[df_all.GarageYrBlt.isnull(), 'GarageYrBlt'] = \
        df_all.loc[df_all.GarageYrBlt.isnull(), 'YearBuilt']

    # No masonry veneer - fill area with 0
    df_all.MasVnrArea.fillna(0, inplace=True)

    # No basement - fill areas/counts with 0
    df_all.BsmtFullBath.fillna(0, inplace=True)
    df_all.BsmtHalfBath.fillna(0, inplace=True)
    df_all.BsmtFinSF1.fillna(0, inplace=True)
    df_all.BsmtFinSF2.fillna(0, inplace=True)
    df_all.BsmtUnfSF.fillna(0, inplace=True)
    df_all.TotalBsmtSF.fillna(0, inplace=True)

    # No garage - fill areas/counts with 0
    df_all.GarageArea.fillna(0, inplace=True)
    df_all.GarageCars.fillna(0, inplace=True)

    # 2.LotFrontage NA Values #########
    # LotFrontage: fill NA values using a linear regressor
    # convert categoricals to dummies, exclude SalePrice from model
    df_frontage = pd.get_dummies(df_all.drop('SalePrice', axis=1))
    # normalise columns to 0-1
    for col in df_frontage.drop('LotFrontage', axis=1).columns:
        df_frontage[col] = ProcessData.scale_minmax(df_frontage[col])
    lf_train = df_frontage.dropna()
    lf_train_y = lf_train.LotFrontage
    lf_train_X = lf_train.drop('LotFrontage', axis=1)

    # fit model
    lr = Ridge()
    lr.fit(lf_train_X, lf_train_y)

    # check model results
    lr_coefs = pd.Series(lr.coef_, index=lf_train_X.columns)
    print('----------------')
    print('Intercept:', lr.intercept_)
    print('----------------coefficient: head(10)')
    print(lr_coefs.sort_values(ascending=False).head(10))
    print('----------------coefficient: tail(10)')
    print(lr_coefs.sort_values(ascending=False).tail(10))
    print('----------------')
    print('R2:', lr.score(lf_train_X, lf_train_y))
    print('----------------')

    # fill na values using model predictions
    na_frontage = df_all.LotFrontage.isnull()
    X = df_frontage[na_frontage].drop('LotFrontage', axis=1)
    y = lr.predict(X)
    df_all.loc[na_frontage, 'LotFrontage'] = y

    # 3.Remaining NaNs #########
    print(cols_with_na.sort_values(ascending=False).to_string())
    rows_with_na = df_all.drop('SalePrice', axis=1).isnull().sum(axis=1)
    rows_with_na = rows_with_na[rows_with_na > 0]
    print(rows_with_na.sort_values(ascending=False).to_string())
    # fill remaining NA with mode in that column
    for col in cols_with_na.index:
        df_all[col].fillna(df_all[col].mode()[0], inplace=True)
    # Now no more NaN values
    df_all.info()

    # 4.Basement Finish Types #########
    # create separate columns for area of each possible basement finish type
    bsmt_fin_cols = ['BsmtGLQ', 'BsmtALQ', 'BsmtBLQ', 'BsmtRec', 'BsmtLwQ']
    for col in bsmt_fin_cols:
        # initialise as columns of zeros
        df_all[col + 'SF'] = 0
    # fill remaining finish type columns
    for row in df_all.index:
        fin1 = df_all.loc[row, 'BsmtFinType1']
        if (fin1 != 'None') and (fin1 != 'Unf'):
            # add area (SF) to appropriate column
            df_all.loc[row, 'Bsmt' + fin1 + 'SF'] += df_all.loc[row, 'BsmtFinSF1']
        fin2 = df_all.loc[row, 'BsmtFinType2']
        if (fin2 != 'None') and (fin2 != 'Unf'):
            df_all.loc[row, 'Bsmt' + fin2 + 'SF'] += df_all.loc[row, 'BsmtFinSF2']
    # remove initial BsmtFin columns
    df_all.drop(['BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2'],
                axis=1, inplace=True)
    # already have BsmtUnf column in dataset
    bsmt_fin_cols.append('BsmtUnf')
    # also create features representing the fraction of the basement that is each finish type
    for col in bsmt_fin_cols:
        df_all[col + 'Frac'] = df_all[col + 'SF'] / df_all['TotalBsmtSF']
        # replace any NA with zero (for properties without a basement)
        df_all[col + 'Frac'].fillna(0, inplace=True)

    # 5.1st and 2nd Floor Area #########
    df_all['LowQualFinFrac'] = df_all['LowQualFinSF'] / df_all['GrLivArea']
    df_all['1stFlrFrac'] = df_all['1stFlrSF'] / df_all['GrLivArea']
    df_all['2ndFlrFrac'] = df_all['2ndFlrSF'] / df_all['GrLivArea']
    df_all['TotalAreaSF'] = df_all['GrLivArea'] + df_all['TotalBsmtSF'] + \
        df_all['GarageArea'] + df_all['EnclosedPorch'] + df_all['ScreenPorch']
    df_all['LivingAreaSF'] = df_all['1stFlrSF'] + df_all['2ndFlrSF'] + \
        df_all['BsmtGLQSF'] + df_all['BsmtALQSF'] + df_all['BsmtBLQSF']
    df_all['StorageAreaSF'] = df_all['LowQualFinSF'] + df_all['BsmtRecSF'] + \
        df_all['BsmtLwQSF'] + df_all['BsmtUnfSF'] + df_all['GarageArea']

    # 6.Categorical Features with Meaningful Ordering #########
    # convert some categorical values to numeric scales
    # Excellent, Good, Typical, Fair, Poor, None: convert to 0-5 scale
    cols_ExGd = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
    dict_ExGd = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
    for col in cols_ExGd:
        df_all[col].replace(dict_ExGd, inplace=True)
    print(df_all[cols_ExGd].head(5))

    # Remaining columns
    df_all['BsmtExposure'].replace({'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
                                   inplace=True)
    df_all['CentralAir'].replace({'Y': 1, 'N': 0}, inplace=True)
    df_all['Functional'].replace({'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4,
                                  'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0},
                                 inplace=True)
    df_all['GarageFinish'].replace({'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
                                   inplace=True)
    df_all['LotShape'].replace({'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0}, inplace=True)
    df_all['Utilities'].replace({'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
                                inplace=True)
    df_all['LandSlope'].replace({'Gtl': 2, 'Mod': 1, 'Sev': 0}, inplace=True)

    # 7.Dealing with Zeros #########
    # fraction of zeros in each column
    frac_zeros = ((df_all == 0).sum() / len(df_all))
    # no. unique values in each column
    n_unique = df_all.nunique()
    # difference between frac. zeros and expected frac. zeros if values
    # evenly distributed between classes
    xs_zeros = frac_zeros - 1 / n_unique
    # create dataframe and display which columns may be problematic
    zero_cols = pd.DataFrame({'frac_zeros': frac_zeros, 'n_unique': n_unique,
                              'xs_zeros': xs_zeros})
    zero_cols = zero_cols[zero_cols.frac_zeros > 0]
    zero_cols.sort_values(by='xs_zeros', ascending=False, inplace=True)
    print(zero_cols[(zero_cols.xs_zeros > 0)])

    # very few properties with Pool or 3SsnPorch - replace columns with binary indicator
    df_all['HasPool'] = (df_all['PoolQC'] > 0).astype(int)
    df_all['Has3SsnPorch'] = (df_all['3SsnPorch'] > 0).astype(int)
    df_all.drop(['PoolQC', 'PoolArea', '3SsnPorch'], axis=1, inplace=True)

    # 'half' bathrooms - add half value to 'full' bathrooms
    df_all['BsmtFullBath'] = df_all['BsmtFullBath'] + 0.5 * df_all['BsmtHalfBath']
    df_all['FullBath'] = df_all['FullBath'] + 0.5 * df_all['HalfBath']
    df_all.drop(['BsmtHalfBath', 'HalfBath'], axis=1, inplace=True)

    # create additional dummy variable for continuous variables with a lot of zeros
    dummy_cols = ['LowQualFinSF', '2ndFlrSF', 'MiscVal', 'ScreenPorch', 'WoodDeckSF',
                  'OpenPorchSF', 'EnclosedPorch', 'MasVnrArea', 'GarageArea',
                  'Fireplaces', 'BsmtGLQSF', 'BsmtALQSF', 'BsmtBLQSF', 'BsmtRecSF',
                  'BsmtLwQSF', 'BsmtUnfSF', 'TotalBsmtSF']
    for col in dummy_cols:
        df_all['Has' + col] = (df_all[col] > 0).astype(int)

    # 8.Log Transform SalePrice #########
    # Log Transform SalePrice to improve normality
    sp = df_all.SalePrice
    df_all.SalePrice = np.log(sp)
    print(df_all.SalePrice.describe())

    # 9.Identify Types of Features #########
    # extract names of numeric columns
    dtypes = df_all.dtypes
    cols_numeric = dtypes[dtypes != object].index.tolist()
    # MSSubClass should be treated as categorical
    cols_numeric.remove('MSSubClass')
    # choose any numeric column with less than 13 values to be "discrete".
    # 13 chosen to include months of the year. other columns "continuous"
    col_nunique = dict()
    for col in cols_numeric:
        col_nunique[col] = df_all[col].nunique()
    col_nunique = pd.Series(col_nunique)
    cols_discrete = col_nunique[col_nunique < 13].index.tolist()
    cols_continuous = col_nunique[col_nunique >= 13].index.tolist()
    print(len(cols_numeric), 'numeric columns, of which', len(cols_continuous),
          'are continuous and', len(cols_discrete), 'are discrete.')
    # extract names of categorical columns
    cols_categ = dtypes[~dtypes.index.isin(cols_numeric)].index.tolist()
    for col in cols_categ:
        df_all[col] = df_all[col].astype('category')
    print(len(cols_categ), 'categorical columns.')

    # 10.Correlation Between Numeric Features #########
    # correlation between numeric variables
    df_corr = df_all.loc[self.id_train, cols_numeric].corr(method='spearman').abs()
    # order columns and rows by correlation with SalePrice
    df_corr = df_corr.sort_values('SalePrice', axis=0, ascending=False) \
                     .sort_values('SalePrice', axis=1, ascending=False)
    print(df_corr.SalePrice.head(20))
    print('-----------------')
    print(df_corr.SalePrice.tail(10))

    # 11.Normalise Numeric Features #########
    # normalise numeric columns
    scale_cols = [col for col in cols_numeric if col != 'SalePrice']
    df_all[scale_cols] = df_all[scale_cols].apply(ProcessData.scale_minmax, axis=0)
    df_all[scale_cols].describe()

    # 12.Box-Cox Transform Suitable Variables #########
    # variables not suitable for box-cox transformation (usually due to excessive zeros)
    cols_notransform = ['2ndFlrSF', '1stFlrFrac', '2ndFlrFrac', 'StorageAreaSF',
                        'EnclosedPorch', 'LowQualFinSF', 'MasVnrArea', 'MiscVal',
                        'ScreenPorch', 'OpenPorchSF', 'WoodDeckSF', 'SalePrice',
                        'BsmtGLQSF', 'BsmtALQSF', 'BsmtBLQSF', 'BsmtRecSF',
                        'BsmtLwQSF', 'BsmtUnfSF', 'BsmtGLQFrac', 'BsmtALQFrac',
                        'BsmtBLQFrac', 'BsmtRecFrac', 'BsmtLwQFrac', 'BsmtUnfFrac']
    cols_transform = [col for col in cols_continuous if col not in cols_notransform]
    # transform remaining variables
    print('Transforming', len(cols_transform), 'columns:', cols_transform)
    for col in cols_transform:
        # transform column
        df_all.loc[:, col], _ = stats.boxcox(df_all.loc[:, col] + 1)
        # renormalise column
        df_all.loc[:, col] = ProcessData.scale_minmax(df_all.loc[:, col])

    # 13.Prepare Data for Model Fitting #########
    # select features, encode categoricals, create dataframe for model fitting
    # select which features to use (all for now)
    model_cols = df_all.columns
    # encode categoricals
    self.df_model = pd.get_dummies(df_all[model_cols])
    # Rather than including Condition1 and Condition2, or Exterior1st and Exterior2nd,
    # combine the dummy variables (allowing 2 true values per property)
    if ('Condition1' in model_cols) and ('Condition2' in model_cols):
        cond_suffix = ['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn']
        for suffix in cond_suffix:
            col_cond1 = 'Condition1_' + suffix
            col_cond2 = 'Condition2_' + suffix
            self.df_model[col_cond1] = self.df_model[col_cond1] | self.df_model[col_cond2]
            self.df_model.drop(col_cond2, axis=1, inplace=True)
    if ('Exterior1st' in model_cols) and ('Exterior2nd' in model_cols):
        # some different strings in Exterior1st and Exterior2nd for same type -
        # rename columns to correct
        self.df_model.rename(columns={'Exterior2nd_Wd Shng': 'Exterior2nd_WdShing',
                                      'Exterior2nd_Brk Cmn': 'Exterior2nd_BrkComm',
                                      'Exterior2nd_CmentBd': 'Exterior2nd_CemntBd'},
                             inplace=True)
        ext_suffix = ['AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard',
                      'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco', 'VinylSd',
                      'Wd Sdng', 'WdShing', 'AsbShng']
        for suffix in ext_suffix:
            col_cond1 = 'Exterior1st_' + suffix
            col_cond2 = 'Exterior2nd_' + suffix
            self.df_model[col_cond1] = self.df_model[col_cond1] | self.df_model[col_cond2]
            self.df_model.drop(col_cond2, axis=1, inplace=True)
    print(self.df_model.head())

    # 14.Identify and Remove Outliers #########
    # get training data
    self.split_data_to_train_validation()
    # find and remove outliers using a Ridge model
    outliers = self.find_outliers(Ridge(),
                                  self.df_model.loc[self.id_train],
                                  self.df_model.loc[self.id_train].SalePrice)
    # permanently remove these outliers from the data
    self.df_model = self.df_model.drop(outliers)
    self.id_train = self.id_train.drop(outliers)
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos', 'display_address', 'street_address'], axis=1, inplace=True)

categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))

bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price
train_test.drop('price', axis=1, inplace=True)

train_test['bathrooms_cat'] = train_test['bathrooms'].apply(lambda x: str(x))
train_test['bathrooms_cat'], labels = pd.factorize(train_test['bathrooms_cat'].values, sort=True)
train_test.drop('bathrooms', axis=1, inplace=True)

train_test['bedroom_cat'], labels = pd.factorize(train_test['bedrooms'].values, sort=True)
train_test.drop('bedrooms', axis=1, inplace=True)

features = list(train_test.columns)
def _predict(self, h=None, smoothing_level=None, smoothing_slope=None,
             smoothing_seasonal=None, initial_level=None, initial_slope=None,
             damping_slope=None, initial_seasons=None, use_boxcox=None,
             lamda=None, remove_bias=None, is_optimized=None):
    """
    Helper prediction function

    Parameters
    ----------
    h : int, optional
        The number of time steps to forecast ahead.
    """
    # Variable renames to alpha, beta, etc as this helps with following the
    # mathematical notation in general
    alpha = smoothing_level
    beta = smoothing_slope
    gamma = smoothing_seasonal
    phi = damping_slope

    # Start in sample and out of sample predictions
    data = self.endog
    damped = self.damped
    seasoning = self.seasoning
    trending = self.trending
    trend = self.trend
    seasonal = self.seasonal
    m = self.seasonal_periods
    phi = phi if damped else 1.0
    if use_boxcox == 'log':
        lamda = 0.0
        y = boxcox(data, 0.0)
    elif isinstance(use_boxcox, float):
        lamda = use_boxcox
        y = boxcox(data, lamda)
    elif use_boxcox:
        y, lamda = boxcox(data)
    else:
        lamda = None
        y = data.squeeze()
        if np.ndim(y) != 1:
            raise NotImplementedError('Only 1 dimensional data supported')
    y_alpha = np.zeros((self.nobs,))
    y_gamma = np.zeros((self.nobs,))
    alphac = 1 - alpha
    y_alpha[:] = alpha * y
    if trending:
        betac = 1 - beta
    if seasoning:
        gammac = 1 - gamma
        y_gamma[:] = gamma * y
    lvls = np.zeros((self.nobs + h + 1,))
    b = np.zeros((self.nobs + h + 1,))
    s = np.zeros((self.nobs + h + m + 1,))
    lvls[0] = initial_level
    b[0] = initial_slope
    s[:m] = initial_seasons
    phi_h = (np.cumsum(np.repeat(phi, h + 1)**np.arange(1, h + 1 + 1))
             if damped else np.arange(1, h + 1 + 1))
    trended = {'mul': np.multiply,
               'add': np.add,
               None: lambda l, b: l}[trend]
    detrend = {'mul': np.divide,
               'add': np.subtract,
               None: lambda l, b: 0}[trend]
    dampen = {'mul': np.power,
              'add': np.multiply,
              None: lambda b, phi: 0}[trend]
    nobs = self.nobs
    if seasonal == 'mul':
        for i in range(1, nobs + 1):
            lvls[i] = y_alpha[i - 1] / s[i - 1] + \
                (alphac * trended(lvls[i - 1], dampen(b[i - 1], phi)))
            if trending:
                b[i] = (beta * detrend(lvls[i], lvls[i - 1])) + \
                    (betac * dampen(b[i - 1], phi))
            s[i + m - 1] = y_gamma[i - 1] / trended(lvls[i - 1], dampen(b[i - 1], phi)) + \
                (gammac * s[i - 1])
        slope = b[1:nobs + 1].copy()
        season = s[m:nobs + m].copy()
        lvls[nobs:] = lvls[nobs]
        if trending:
            b[:nobs] = dampen(b[:nobs], phi)
            b[nobs:] = dampen(b[nobs], phi_h)
        trend = trended(lvls, b)
        s[nobs + m - 1:] = [s[(nobs - 1) + j % m] for j in range(h + 1 + 1)]
        fitted = trend * s[:-m]
    elif seasonal == 'add':
        for i in range(1, nobs + 1):
            lvls[i] = y_alpha[i - 1] - (alpha * s[i - 1]) + \
                (alphac * trended(lvls[i - 1], dampen(b[i - 1], phi)))
            if trending:
                b[i] = (beta * detrend(lvls[i], lvls[i - 1])) + \
                    (betac * dampen(b[i - 1], phi))
            s[i + m - 1] = y_gamma[i - 1] - \
                (gamma * trended(lvls[i - 1], dampen(b[i - 1], phi))) + \
                (gammac * s[i - 1])
        slope = b[1:nobs + 1].copy()
        season = s[m:nobs + m].copy()
        lvls[nobs:] = lvls[nobs]
        if trending:
            b[:nobs] = dampen(b[:nobs], phi)
            b[nobs:] = dampen(b[nobs], phi_h)
        trend = trended(lvls, b)
        s[nobs + m - 1:] = [s[(nobs - 1) + j % m] for j in range(h + 1 + 1)]
        fitted = trend + s[:-m]
    else:
        for i in range(1, nobs + 1):
            lvls[i] = y_alpha[i - 1] + \
                (alphac * trended(lvls[i - 1], dampen(b[i - 1], phi)))
            if trending:
                b[i] = (beta * detrend(lvls[i], lvls[i - 1])) + \
                    (betac * dampen(b[i - 1], phi))
        slope = b[1:nobs + 1].copy()
        season = s[m:nobs + m].copy()
        lvls[nobs:] = lvls[nobs]
        if trending:
            b[:nobs] = dampen(b[:nobs], phi)
            b[nobs:] = dampen(b[nobs], phi_h)
        trend = trended(lvls, b)
        fitted = trend
    level = lvls[1:nobs + 1].copy()
    if use_boxcox or use_boxcox == 'log' or isinstance(use_boxcox, float):
        fitted = inv_boxcox(fitted, lamda)
        level = inv_boxcox(level, lamda)
        slope = detrend(trend[:nobs], level)
        if seasonal == 'add':
            season = (fitted - inv_boxcox(trend, lamda))[:nobs]
        else:  # seasonal == 'mul'
            season = (fitted / inv_boxcox(trend, lamda))[:nobs]
    sse = sqeuclidean(fitted[:-h - 1], data)
    # (s0 + gamma) + (b0 + beta) + (l0 + alpha) + phi
    k = m * seasoning + 2 * trending + 2 + 1 * damped
    aic = self.nobs * np.log(sse / self.nobs) + k * 2
    if self.nobs - k - 3 > 0:
        aicc_penalty = (2 * (k + 2) * (k + 3)) / (self.nobs - k - 3)
    else:
        aicc_penalty = np.inf
    aicc = aic + aicc_penalty
    bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
    resid = data - fitted[:-h - 1]
    if remove_bias:
        fitted += resid.mean()
    self.params = {'smoothing_level': alpha,
                   'smoothing_slope': beta,
                   'smoothing_seasonal': gamma,
                   'damping_slope': phi if damped else np.nan,
                   'initial_level': lvls[0],
                   'initial_slope': b[0] / phi,
                   'initial_seasons': s[:m],
                   'use_boxcox': use_boxcox,
                   'lamda': lamda,
                   'remove_bias': remove_bias}

    # Format parameters into a DataFrame
    codes = ['alpha', 'beta', 'gamma', 'l.0', 'b.0', 'phi']
    codes += ['s.{0}'.format(i) for i in range(m)]
    idx = ['smoothing_level', 'smoothing_slope', 'smoothing_seasonal',
           'initial_level', 'initial_slope', 'damping_slope']
    idx += ['initial_seasons.{0}'.format(i) for i in range(m)]

    formatted = [alpha, beta, gamma, lvls[0], b[0], phi]
    formatted += s[:m].tolist()
    formatted = list(map(lambda v: np.nan if v is None else v, formatted))
    formatted = np.array(formatted)
    if is_optimized is None:
        optimized = np.zeros(len(codes), dtype=np.bool)
    else:
        optimized = is_optimized.astype(np.bool)
    included = [True, trending, seasoning, True, trending, damped]
    included += [True] * m
    formatted = pd.DataFrame([[c, f, o] for c, f, o in zip(codes, formatted, optimized)],
                             columns=['name', 'param', 'optimized'],
                             index=idx)
    formatted = formatted.loc[included]

    hwfit = HoltWintersResults(self, self.params, fittedfcast=fitted,
                               fittedvalues=fitted[:-h - 1],
                               fcastvalues=fitted[-h - 1:], sse=sse, level=level,
                               slope=slope, season=season, aic=aic, bic=bic,
                               aicc=aicc, resid=resid, k=k,
                               params_formatted=formatted, optimized=optimized)
    return HoltWintersResultsWrapper(hwfit)
def __box_cox_transform(a):
    from scipy import stats
    boxcox, maxlog = stats.boxcox(a, lmbda=None, alpha=None)
    return boxcox
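# Discarding maxlog above makes the transform one-way: without the fitted
# lambda there is no way to invert the transform later. A hedged variant
# (the name is illustrative) that returns both:
from scipy import stats

def box_cox_transform_with_lambda(a):
    # stats.boxcox returns (transformed values, estimated lambda)
    transformed, maxlog = stats.boxcox(a)
    return transformed, maxlog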
# Separating the numeric features
df_numeric = df[['subscriber', 'Trend_day_count', 'Tag_count',
                 'Trend_tag_count', 'comment_count', 'likes', 'dislike']]
df_numeric["Trend_tag_count"] = df["Trend_tag_count"].astype('int64')

import numpy as np

# log-transform the independent variables to remove skewness
df_numeric1 = np.log1p(df_numeric)

# transforming the response variable
from scipy import stats
from scipy.stats import boxcox

y = df['views']
y = y.dropna()
box_y = y.copy()
# lmbda=0 is a log transform, used to make the distribution more normal
# (the original called boxcox on box_y before box_y was defined)
box_y = boxcox(box_y, lmbda=0.0)

###### NLP part #####
# preprocessing text data
# removed description as it contains unwanted info
n_data = df[["channel_title", "title", "tags"]]
n_data = n_data.dropna()
n_data["tags"] = n_data["tags"].str.split('|').str.join(' ')
n_data["np"] = n_data["channel_title"] + ' ' + n_data["title"] + ' ' + n_data["tags"]
nlp_data = n_data[["np"]]

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
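# The ordering above matters: NaNs have to be dropped before the transform,
# since they corrupt boxcox's output, and boxcox rejects non-positive input.
# A small hedged helper capturing that order (the helper name is illustrative):
import pandas as pd
from scipy.stats import boxcox

def log_boxcox_target(series):
    s = series.dropna()
    s = s[s > 0]  # boxcox requires strictly positive values
    return pd.Series(boxcox(s.to_numpy(), lmbda=0.0), index=s.index)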
df = pd.read_csv(
    '/Users/nikita/PycharmProjects/ML_Tasks/course5/week1/timelines/WAG_C_M.csv',
    index_col=['month'],
    parse_dates=['month'],
    dayfirst=True,
    sep=';')
df.plot()
plt.ylabel('Salary (rubles)')
plt.show()

print("Dickey-Fuller test, original series: p=%f" %
      sm.tsa.stattools.adfuller(df.WAG_C_M)[1])
# The series is non-stationary, with an upward trend and yearly seasonality
sm.tsa.seasonal_decompose(df.WAG_C_M).plot()
plt.show()

# Apply a Box-Cox transformation to stabilize the variance:
df["salary_bxcx"], lmbda = stats.boxcox(df.WAG_C_M)
df["salary_bxcx"].plot()
plt.ylabel(u'Transformed Salaries')
print("Optimal Box-Cox transformation parameter: %f" % lmbda)
print("Dickey-Fuller test after the Box-Cox transformation: p=%f" %
      sm.tsa.stattools.adfuller(df["salary_bxcx"])[1])
plt.show()

# The series is smoother but still clearly non-stationary.
# Try seasonal differencing with a seasonal lag of 12
df['salary_box_diff'] = df.salary_bxcx - df.salary_bxcx.shift(12)
sm.tsa.seasonal_decompose(df.salary_box_diff[12:]).plot()
print("Dickey-Fuller test after Box-Cox and seasonal differencing: p=%f" %
      sm.tsa.stattools.adfuller(df.salary_box_diff[12:])[1])
plt.show()
# The Dickey-Fuller p-value is < 0.05, but a trend is still visible
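# The same pattern (Box-Cox, seasonal difference, re-test) recurs in several
# snippets below, so here it is as a small hedged helper; the function name
# and the default lag are illustrative.
import pandas as pd
import statsmodels.api as sm
from scipy import stats

def boxcox_seasonal_diff(series, lag=12):
    """Box-Cox a positive series, difference it seasonally, and return
    (differenced series, lambda, Dickey-Fuller p-value)."""
    transformed, lmbda = stats.boxcox(series)
    diffed = pd.Series(transformed, index=series.index).diff(lag).dropna()
    pvalue = sm.tsa.stattools.adfuller(diffed)[1]
    return diffed, lmbda, pvalue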
def normalize_data(ingredient_attribute, prediction_attributes):
    """
    My clusters were not differentiating well, and after some reading, I
    realized that this was because the data is not a normal/gaussian
    distribution. GMM expects the input data to follow a normal distribution.

    Testing distribution of the features: the plots illustrate the non-normal
    distribution of each column. They appear to be more geometric or log-like,
    so we need to normalize them.
    Reference: http://www.kmdatascience.com/2017/07/box-cox-transformations-in-python.html

    After some more reading and testing, I learned about the Box-Cox
    transformation method and decided to use it to transform the columns.
    Commenting out the plot in the for loop for submission because sometimes
    it leads to memory issues with so many plots. Scipy has a built-in
    Box-Cox transformation, so I decided to use that.
    """
    for i in ingredient_attribute[prediction_attributes].columns:
        """plot = sns.distplot(ingredient_attribute[i], hist=True, kde=False,
                               bins=int((ingredient_attribute[i].max() -
                                         ingredient_attribute[i].min())/10),
                               color='green', hist_kws={'edgecolor':'black'})
        # clear the plot otherwise each iteration of the loop will place a new graph on top
        plt.clf()
        """
        # clean the data --
        # from Open Food Facts: https://static.openfoodfacts.org/data/data-fields.txt
        # "fields that end with _100g correspond to the amount of a nutriment
        # (in g, or kJ for energy) for 100 g or 100 ml of product"
        if i.endswith("_100g") and np.issubdtype(ingredient_attribute[i].dtype, np.number):
            # based on the field description, the range of these columns = 0 to 100
            # drop rows with negative values
            ingredient_attribute = ingredient_attribute.drop(
                ingredient_attribute[ingredient_attribute[i] < 0].index)
            # and those > 100
            ingredient_attribute = ingredient_attribute.drop(
                ingredient_attribute[ingredient_attribute[i] > 100].index)

            # get values to transform
            transform = np.asarray(ingredient_attribute[i].values)
            # boxcox requires strictly positive values (> 0), so reset zeros
            # (and anything below 1) to a small positive value
            transform[transform < 1] = 1

            """
            I found this example on Kaggle which looks very similar to what
            I am trying to do:
            https://www.kaggle.com/allunia/hidden-treasures-in-our-groceries
            The author mentions that the lambda values are very important,
            so I decided to compare what mine were:
                1) energy  | 0.7   | 0.617
                2) carbs   | 0.9   | -0.346
                3) fat     | 0.5   | -82.03
                4) protein | 0.1   | -6.44
                5) sugar   | 0.03  | -1.37
                6) salt    | 0.005 | -2.44
            Trying out different ones did not help with the silhouette score
            in my case, so I decided to leave the default.
            The author also mentions using only 3 different variables for the
            clusters, so I decided to use the ones that required the least
            amount of normalization (energy, carbs and sugar). These three
            are also the ones with the lowest % of zeros.
            """
            output = stats.boxcox(transform)
            transformed_data = output[0]
            # save back the transformed data
            ingredient_attribute[i] = transformed_data

            # how many bins should the histogram plot have?
            # calculated using the range of each column
            bins = int((transformed_data.max() - transformed_data.min()) / 10)
            if bins < 3:
                bins = 5
            """
            ## replot to see the difference
            plot = sns.distplot(transformed_data, hist=True, kde=False,
                                bins=bins, color='orange',
                                hist_kws={'edgecolor':'black'})
            # and clear again
            plt.clf()
            """
    ingredient_attribute.to_csv("./data/transformed_data_us.csv")
    return ingredient_attribute
def box_cox(self, data_df, group, attribute):
    # scipy.stats.boxcox returns (transformed values, lambda); keep only the
    # transformed values, fitting a separate lambda within each group
    # (the top-level scipy namespace has no boxcox function)
    data_df[attribute] = data_df.groupby(group)[attribute].transform(
        lambda x: scipy.stats.boxcox(x)[0])
    return data_df
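# A usage sketch for the groupby Box-Cox above; the DataFrame and its columns
# are made up for illustration. Each group gets its own fitted lambda.
import pandas as pd
import scipy.stats

df_demo = pd.DataFrame({
    'team': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
    'score': [1.0, 2.0, 4.0, 8.0, 3.0, 9.0, 27.0, 81.0],
})
df_demo['score'] = df_demo.groupby('team')['score'].transform(
    lambda x: scipy.stats.boxcox(x)[0])
print(df_demo)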
print("## Extracting Box-Cox features ") # print("## Plot Transformations for **amount**:") # figure = plt.figure(figsize=(16, 5)) # figure.add_subplot(131) # plt.title("Amount Histogram") # plt.hist(tmpData['amount'] ,facecolor='blue',alpha=0.75) # plt.xlabel("Transaction amount") # plt.text(10,100000,"Skewness: {0:.2f}".format(skew(tmpData['amount']))) # figure.add_subplot(132) # plt.title("SQRT on amount histogram") # plt.hist(np.sqrt(tmpData['amount']), facecolor = 'red', alpha=0.5) # plt.xlabel("Square root of amount") # plt.text(10, 100000, "Skewness: {0:.2f}".format(skew(np.sqrt(tmpData['amount'])))) tmpData['amount_boxcox'] = preprocessing.scale( boxcox(tmpData['amount'] + 1)[0]) # figure.add_subplot(133) # plt.title("Box-cox on amount histogram") # plt.hist(tmpData['amount_boxcox'], facecolor = 'red', alpha=0.5) # plt.xlabel("Box cox of amount") # plt.text(10, 100000, "Skewness: {0:.2f}".format(skew(tmpData['amount_boxcox']))) # plt.show() # High skewness on left side but box-cox reveals normal distribution # print("## Plot Transformations for **oldbalanceOrg**:") # figure = plt.figure(figsize=(16, 5)) # figure.add_subplot(131) # plt.title("oldbalanceOrg Histogram") # plt.hist(tmpData['oldbalanceOrg'] ,facecolor='blue',alpha=0.75) # plt.xlabel("old balance originated") # plt.text(2,100000,"Skewness: {0:.2f}".format(skew(tmpData['oldbalanceOrg'])))
def redraw(self):
    if self.XcomboBox.currentText() == self.YcomboBox.currentText():
        QtWidgets.QMessageBox.critical(self, 'Error',
                                       'Variables \n must be different !',
                                       QtWidgets.QMessageBox.Ok)
        return
    if (self.XcomboBox.currentText() == 'Auto') and \
            (self.YcomboBox.currentText() == 'All') and \
            not self.scatterradioButton.isChecked():
        QtWidgets.QMessageBox.critical(
            self, 'Error',
            "You have to select two rows \n for this kind of plot!",
            QtWidgets.QMessageBox.Ok)
        return
    data = DS.Raw.iloc[DS.Ir, DS.Ic]
    data = data.assign(Lr=DS.Lr[DS.Ir])
    data = data.assign(Cr=DS.Cr[DS.Ir])
    data = data.assign(Gr=DS.Gr[DS.Ir])
    if (self.XcomboBox.currentText() != 'Auto') and (self.YcomboBox.currentText() != 'All'):
        data = data.loc[[self.XcomboBox.currentText(),
                         self.YcomboBox.currentText()]]
    elif (self.XcomboBox.currentText() != 'Auto') and (self.YcomboBox.currentText() == 'All'):
        QtWidgets.QMessageBox.critical(self, 'Error', "Select two rows!",
                                       QtWidgets.QMessageBox.Ok)
        return
    elif (self.XcomboBox.currentText() == 'Auto') and (self.YcomboBox.currentText() != 'All'):
        QtWidgets.QMessageBox.critical(self, 'Error', "Use Univariate plot!",
                                       QtWidgets.QMessageBox.Ok)
        return
    # fraction of missing values, used for the NaN annotation below (the
    # original double isnull() always evaluated to True)
    Nnan = data.isnull().values.mean()
    data = data.T.dropna()
    data = data.T
    Lr = data['Lr'].values
    Cr = data['Cr'].values
    Gr = data['Gr'].values
    data = data.drop('Lr', axis=1)
    data = data.drop('Cr', axis=1)
    data = data.drop('Gr', axis=1)
    # all remaining columns must be numeric (float or int)
    if not all(dt.kind in 'if' for dt in data.dtypes):
        QtWidgets.QMessageBox.critical(self, 'Error',
                                       "Some values are not numbers!",
                                       QtWidgets.QMessageBox.Ok)
        return
    if (self.XcomboBox.currentText() != 'Auto') and (self.YcomboBox.currentText() != 'All'):
        if data.shape[0] != 2:
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           "Row labels must be different",
                                           QtWidgets.QMessageBox.Ok)
            return
        x = data.loc[self.XcomboBox.currentText()].values
        y = data.loc[self.YcomboBox.currentText()].values
    fig = Figure()
    ax = fig.add_subplot(111)
    color = 'blue'
    if self.scatterradioButton.isChecked():
        if (self.XcomboBox.currentText() != 'Auto') and (self.YcomboBox.currentText() != 'All'):
            if self.PcheckBox.isChecked():
                ax.scatter(x, y, marker='o', color=Cr)
            if self.LcheckBox.isChecked():
                ax.plot(x, y, color='blue')
            if self.VcheckBox.isChecked():
                for i, txt in enumerate(Lr):
                    ax.annotate(txt, (x[i], y[i]))
            ax.set_xlabel(self.XcomboBox.currentText())
            ax.set_ylabel(self.YcomboBox.currentText())
        else:
            nr, nc = data.shape
            Lc = DS.Lc[DS.Ic]
            x = range(1, nc + 1)
            color = Cr
            if self.GcheckBox.isChecked():
                groups = Gr
                ngr = len(np.unique(groups))
                color = []
                for key in groups:
                    color.append(cm.viridis.colors[int(
                        (len(cm.viridis.colors) - 1) / ngr * key)])
            for i in range(nr):
                y = data.iloc[i, :]
                col = color[i]
                if self.GcomboBox.currentText() == 'All':
                    if self.PcheckBox.isChecked():
                        ax.scatter(x, y, marker='o', color=col)
                    if self.LcheckBox.isChecked():
                        ax.plot(x, y, color=col)
                else:
                    if int(self.GcomboBox.currentText()) == groups[i]:
                        if self.PcheckBox.isChecked():
                            ax.scatter(x, y, marker='o', color=col)
                        if self.LcheckBox.isChecked():
                            ax.plot(x, y, color=col)
            if nc > 30:
                itick = np.linspace(0, nc - 1, 20).astype(int)
                ltick = Lc[itick]
            else:
                itick = x
                ltick = Lc
            ax.set_xlim([0, nc + 2])
            ax.set_xticks(itick)
            ax.set_xticklabels(ltick, rotation='vertical')
    if self.ellipseradioButton.isChecked():
        def plot_ellipse(x, y, nstd=2, ax=None, **kwargs):
            def eigsorted(cov):
                vals, vecs = np.linalg.eigh(cov)
                order = vals.argsort()[::-1]
                return vals[order], vecs[:, order]
            pos = (x.mean(), y.mean())
            cov = np.cov(x, y)
            vals, vecs = eigsorted(cov)
            theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))
            width, height = 2 * nstd * np.sqrt(vals)
            ellip = Ellipse(xy=pos, width=width, height=height, angle=theta,
                            fill=False, **kwargs)
            ax.add_artist(ellip)
            return ellip
        for j in range(1, 4):
            plot_ellipse(x, y, j, ax)
        ax.scatter(x, y)
        ax.set_xlabel(self.XcomboBox.currentText())
        ax.set_ylabel(self.YcomboBox.currentText())
        ax.set_title('Ellipse for 1,2,3 times the Standard Deviation')
    if self.boxcoxradioButton.isChecked():
        # either series containing non-positive values would break boxcox
        # (the original `and` let a non-positive x slip through)
        if (not (x > 0).all()) or (not (y > 0).all()):
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           "Values must be strictly positive",
                                           QtWidgets.QMessageBox.Ok)
            return
        # scan lambda in [-2, 2] and record how well the transformed x
        # correlates with y
        CBC = np.zeros(50)
        vlambda = np.linspace(-2, 2, 50)
        for i in range(50):
            trans_x = stats.boxcox(x, vlambda[i])
            CBC[i] = np.corrcoef(trans_x, y)[0, 1]
        if self.PcheckBox.isChecked():
            ax.scatter(vlambda, CBC, marker='o', color=color)
        if self.LcheckBox.isChecked():
            ax.plot(vlambda, CBC, color=color)
        ax.set_xlabel('Lambda')
        ax.set_ylabel('Correlation Coefficient')
    if self.histogramradioButton.isChecked():
        cx = 'blue'
        cy = 'red'
        xm = x.mean()
        ym = y.mean()
        xstd = x.std()
        ystd = y.std()
        dy = (ym - 3 * ystd) - (xm + 3 * xstd)
        dx = (xm - 3 * xstd) - (ym + 3 * ystd)
        if (dy > 0) | (dx > 0):
            x = sk.preprocessing.normalize(x.reshape(1, -1), norm='l2',
                                           axis=1, copy=True,
                                           return_norm=False)
            y = sk.preprocessing.normalize(y.reshape(1, -1), norm='l2',
                                           axis=1, copy=True,
                                           return_norm=False)
            x = x.ravel()
            y = y.ravel()
            ax.set_xlabel('Normalized Quantities')
        # Freedman-Diaconis style bin count from the interquartile range
        iqr = np.percentile(x, [75, 25])
        iqr = iqr[0] - iqr[1]
        n = x.size
        dx = abs(max((x.max(), y.max())) - min((x.min(), y.min())))
        nbins = int(np.floor(dx / (2 * iqr) * n**(1 / 3))) + 1
        if nbins > self.spinBox.value():
            self.spinBox.setValue(nbins)
        else:
            nbins = self.spinBox.value()
        bins = np.linspace(min((x.min(), y.min())),
                           max((x.max(), y.max())), nbins)
        ax.hist(x, bins=bins, histtype='bar', color=cx, alpha=0.5,
                orientation='vertical', label=str(self.XcomboBox.currentText()))
        ax.hist(y, bins=bins, histtype='bar', color=cy, alpha=0.5,
                orientation='vertical', label=str(self.YcomboBox.currentText()))
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.2)
    if Nnan:
        ax.annotate('{:04.2f} NaN'.format(Nnan), xy=(0.80, 0.95),
                    xycoords='figure fraction')
    if self.XcheckBox.isChecked():
        if self.XlineEdit.text():
            ax.set_xlabel(self.XlineEdit.text())
    else:
        ax.set_xlabel('')
    if self.YcheckBox.isChecked():
        if self.YlineEdit.text():
            ax.set_ylabel(self.YlineEdit.text())
    else:
        ax.set_ylabel('')
    if self.XGcheckBox.isChecked():
        ax.xaxis.grid(True)
    if self.YGcheckBox.isChecked():
        ax.yaxis.grid(True)
    if self.TlineEdit.text():
        ax.set_title(self.TlineEdit.text())
    if not self.XMcheckBox.isChecked():
        ax.tick_params(axis='x', which='both', bottom=False, top=False,
                       labelbottom=False)
    if not self.YMcheckBox.isChecked():
        ax.tick_params(axis='y', which='both', left=False, right=False,
                       labelleft=False)
    self.rmmpl()
    self.addmpl(fig)
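# The Box-Cox branch above scans lambda for the value that maximizes the
# correlation between transformed x and y. The same idea as a standalone
# sketch on synthetic data: x is a cubed version of the signal behind y, so
# the scan should peak near lambda = 1/3.
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
base = rng.uniform(1, 10, size=200)
y = base + rng.normal(scale=0.1, size=200)
x = base ** 3

vlambda = np.linspace(-2, 2, 50)
CBC = [np.corrcoef(stats.boxcox(x, lam), y)[0, 1] for lam in vlambda]
print("lambda with highest correlation: %.2f" % vlambda[int(np.argmax(CBC))])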
def development_score_by_draft_count(self, players_df, coaches_df):
    drafted = players_df[~players_df['drafted'].isnull()]
    drafted = drafted.groupby(['stars']).size()
    all_players = players_df.groupby(['stars']).size()  # avoid shadowing builtin all()
    drafted_fraction = pd.concat([drafted, all_players], axis=1)
    drafted_fraction.columns = ['drafted', 'total']
    drafted_fraction['stars'] = drafted_fraction.index
    drafted_fraction = drafted_fraction.reset_index(drop=True)
    drafted_fraction['fraction'] = drafted_fraction['drafted'] / drafted_fraction['total']
    drafted_fraction.to_csv(
        os.path.join(self.output_dir, 'drafted_fraction_by_star.csv'))
    for index, coach in coaches_df.iterrows():
        players = players_df[players_df['team'] == coach['team']]
        cond1 = (players['drafted'] > coach['first_year']) & \
                (players['drafted'] <= (coach['last_year'] + 1))
        cond2 = ((players['enrolled'] + players['ncaaf_years']) > coach['first_year']) & \
                (players['enrolled'] <= coach['last_year'])
        cond3 = ((players['enrolled'] + 4) > coach['first_year']) & \
                (players['enrolled'] <= coach['last_year'])
        players = players[cond1 | cond2 | cond3]
        coach_drafted = players[~players['drafted'].isnull()]
        coach_drafted = coach_drafted.groupby(['stars']).size()
        coach_all = players.groupby(['stars']).size()
        coach_drafted_fraction = pd.concat([coach_drafted, coach_all], axis=1)
        coach_drafted_fraction.columns = ['drafted', 'total']
        coach_drafted_fraction['stars'] = coach_drafted_fraction.index
        coach_drafted_fraction = coach_drafted_fraction.reset_index(drop=True)
        coach_drafted_fraction['fraction'] = \
            coach_drafted_fraction['drafted'] / coach_drafted_fraction['total']
        coach_drafted_fraction = coach_drafted_fraction[
            ['stars', 'drafted', 'total', 'fraction']]
        merged = pd.merge(left=coach_drafted_fraction, right=drafted_fraction,
                          how='left', left_on='stars', right_on='stars')
        merged = merged.dropna(subset=['total_x'])
        merged = merged[merged['total_x'] >= 5]
        merged['fraction_x'] = merged['fraction_x'].fillna(0)
        perc_diff = (merged['fraction_x'] - merged['fraction_y']) / \
            merged['fraction_y'] * 100
        coaches_df.loc[index, 'development_ability'] = perc_diff.median()
    coaches_df = coaches_df.dropna(subset=['development_ability'])
    coaches_df = coaches_df.reset_index(drop=True)
    # shift so all values are strictly positive before the Box-Cox transform
    coaches_df['development_ability'] = coaches_df['development_ability'] + \
        coaches_df['development_ability'].min() * -1 + .01
    # the top-level scipy namespace has no boxcox; use scipy.stats.boxcox
    coaches_df['development_ability'] = coaches_df[
        'development_ability'].transform(lambda x: scipy.stats.boxcox(x)[0])
    mean_percent_drafted = coaches_df['development_ability'].mean()
    std_percent_drafted = coaches_df['development_ability'].std()
    coaches_df['development_ability'] = (
        coaches_df['development_ability'] - mean_percent_drafted) / std_percent_drafted
    coaches_df = coaches_df.reset_index(drop=True)
    return coaches_df
plt.plot(df_month.y, '-', label='true-values_By Months')
plt.plot(df_month.original, '-', label='raw-data_By Months')
plt.legend()
# plt.tight_layout()
plt.show()

# Look at the trend
plt.figure(figsize=[15, 7])
sm.tsa.seasonal_decompose(df_month.y).plot()
print("work3 test: p={}".format(adfuller(df_month.y)[1]))
# air_passengers test: p=0.996129346920727

# Box-Cox transformation of the time series
df_month['y_box'], lmbda = stats.boxcox(df_month.y)
print("work3 test: p={}".format(adfuller(df_month.y_box)[1]))
# air_passengers test: p=0.7011194980409873

# Seasonal differencing, to determine the seasonal period m for the SARIMA model
df_month['y_box_diff'] = df_month['y_box'] - df_month['y_box'].shift(12)
def price_prediction_data():
    coin_of_interest = request.args.get('CoinName')
    print('test ' + coin_of_interest)

    # Connect to Amazon SQL
    conn = engine.connect()

    # Grab coin prices
    query = '''
    SELECT RecordDate, OpenPrice, High, Low, ClosingPrice, AdjClose, Volume,
           c.CoinName, cph.TokenName
    FROM CoinPriceHistory cph
    INNER JOIN Coins c ON cph.CoinID = c.CoinID
    ORDER BY RecordDate
    '''
    coin_raw = pd.read_sql(query, conn)

    # Grab coin names
    query = '''
    SELECT CoinName
    FROM Coins
    '''
    coin_names = pd.read_sql(query, conn)

    # NEED TO IMPLEMENT: LINK TO PRICE-PREDICT.HTML
    # Get coin-of-interest from user
    # coin_of_interest = request.form['coi']

    # NEED TO IMPLEMENT: PASS RESPONSE BACK TO PRICE-PREDICT.HTML
    # Determine if coin is in db
    # For now, we print it to the terminal.
    if coin_names[coin_names['CoinName'] == coin_of_interest].any().bool():
        print("We got your coin!")
    else:
        print('We do NOT have your coin')

    # Clean data
    coin_history = coin_raw[coin_raw['CoinName'] == coin_of_interest]
    coin_history['RecordDate'] = pd.to_datetime(coin_history['RecordDate'],
                                                errors='coerce')
    coin_history.rename(columns={'RecordDate': 'Timestamp',
                                 'OpenPrice': 'Open',
                                 'ClosingPrice': 'Close'}, inplace=True)
    coin_history.set_index('Timestamp', inplace=True)
    coin_history.drop(['AdjClose', 'Volume', 'CoinName', 'TokenName'],
                      axis=1, inplace=True)
    coin_history = coin_history[coin_history['Close'] != 0]

    # Transform data for ARIMA
    coin_history['Box'], lmbda = stats.boxcox(coin_history.Close)
    coin_history['BoxDiff'] = coin_history.Box - coin_history.Box.shift(12)
    coin_history['BoxDiff2'] = coin_history.BoxDiff - coin_history.BoxDiff.shift(1)

    # Optimize ARIMA Prediction
    Qs = range(0, 2)
    qs = range(0, 3)
    Ps = range(0, 3)
    ps = range(0, 3)
    D = 1
    d = 1
    parameters = product(ps, qs, Ps, Qs)
    parameters_list = list(parameters)
    results = []
    best_aic = float("inf")
    warnings.filterwarnings('ignore')

    # for param in parameters_list:  # full grid search disabled; only the
    param = parameters_list[0]       # first parameter combination is fit
    try:
        model = sm.tsa.statespace.SARIMAX(
            coin_history.Box,
            order=(param[0], d, param[1]),
            seasonal_order=(param[2], D, param[3], 12)).fit(disp=-1)
    except Exception:
        # Need to send this back to the user; re-raise instead of falling
        # through with an unbound `model` as the original did
        print('Data cannot be conditioned for ARIMA model. Sorry!')
        raise
    aic = model.aic
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, model.aic])

    # Generate Price Prediction Data
    def invboxcox(y, lmbda):
        # inverse of the Box-Cox transform: x = (lmbda*y + 1)**(1/lmbda),
        # or exp(y) when lmbda == 0
        if lmbda == 0:
            return np.exp(y)
        else:
            return np.exp(np.log(lmbda * y + 1) / lmbda)

    coin_history_with_predictions = coin_history[['Close']]
    coin_history_with_predictions['Forecast'] = invboxcox(
        best_model.predict(start=0, end=(len(coin_history_with_predictions) - 1)),
        lmbda)
    prediction_dates = [datetime(2021, 4, 30), datetime(2021, 5, 31),
                        datetime(2021, 6, 30), datetime(2021, 7, 31),
                        datetime(2021, 8, 31), datetime(2021, 9, 30),
                        datetime(2021, 10, 31), datetime(2021, 11, 30),
                        datetime(2021, 12, 31)]
    future = pd.DataFrame(index=prediction_dates, columns=coin_history.columns)
    future['Forecast'] = invboxcox(best_model.forecast(steps=len(future)),
                                   lmbda).tolist()
    coin_history_with_predictions = pd.concat(
        [coin_history_with_predictions, future])
    coin_history_with_predictions['Coin'] = coin_of_interest
    graph = coin_history_with_predictions.reset_index().rename(
        columns={'index': 'Date'})
    graph2 = graph[['Coin', 'Date', 'Close', 'Forecast']]

    # Return Price Prediction Data to Plotly
    _json = graph2.to_json(orient='records')
    resp = make_response(_json)
    resp.headers['content-type'] = 'application/json'
    return resp
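# The hand-rolled invboxcox above should agree with scipy's built-in inverse;
# a quick sanity check on made-up prices:
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox

prices = np.array([10.0, 12.5, 9.8, 15.2, 20.1])
transformed, lam = boxcox(prices)
manual = np.exp(np.log(lam * transformed + 1) / lam)  # invboxcox for lam != 0
assert np.allclose(manual, prices)
assert np.allclose(inv_boxcox(transformed, lam), prices)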
plt.figure(figsize=[15, 7])
sm.tsa.seasonal_decompose(df_month.Weighted_Price).plot()
print("Dickey–Fuller test: p=%f" %
      sm.tsa.stattools.adfuller(df_month.Weighted_Price)[1])
plt.show()

# The series is not stationary.

# ## Box-Cox Transformations

# In[ ]:

# Box-Cox Transformations
df_month['Weighted_Price_box'], lmbda = stats.boxcox(df_month.Weighted_Price)
# test the transformed series (the original mistakenly retested Weighted_Price)
print("Dickey–Fuller test: p=%f" %
      sm.tsa.stattools.adfuller(df_month.Weighted_Price_box)[1])

# The series is still not stationary.

# ## Seasonal differentiation

# In[ ]:

# Seasonal differentiation
df_month['prices_box_diff'] = df_month.Weighted_Price_box - \
    df_month.Weighted_Price_box.shift(12)
print("Dickey–Fuller test: p=%f" %
      sm.tsa.stattools.adfuller(df_month.prices_box_diff[12:])[1])
def fit(self):
    """
    Estimate a trend component, multiple seasonal components, and a
    residual component.

    Returns
    -------
    DecomposeResult
        Estimation results.
    """
    num_seasons = len(self.periods)
    iterate = 1 if num_seasons == 1 else self.iterate

    # Box Cox
    if self.lmbda == "auto":
        y, lmbda = boxcox(self._y, lmbda=None)
        self.est_lmbda = lmbda
    elif self.lmbda:
        y = boxcox(self._y, lmbda=self.lmbda)
    else:
        y = self._y

    # Get STL fit params
    stl_inner_iter = self._stl_kwargs.pop("inner_iter", None)
    stl_outer_iter = self._stl_kwargs.pop("outer_iter", None)

    # Iterate over each seasonal component to extract seasonalities
    seasonal = np.zeros(shape=(num_seasons, self.nobs))
    deseas = y
    for _ in range(iterate):
        for i in range(num_seasons):
            deseas = deseas + seasonal[i]
            res = STL(
                endog=deseas,
                period=self.periods[i],
                seasonal=self.windows[i],
                **self._stl_kwargs,
            ).fit(inner_iter=stl_inner_iter, outer_iter=stl_outer_iter)
            seasonal[i] = res.seasonal
            deseas = deseas - seasonal[i]

    seasonal = np.squeeze(seasonal.T)
    trend = res.trend
    rw = res.weights
    resid = deseas - trend

    # Return pandas if endog is pandas
    if isinstance(self.endog, (pd.Series, pd.DataFrame)):
        index = self.endog.index
        y = pd.Series(y, index=index, name="observed")
        trend = pd.Series(trend, index=index, name="trend")
        resid = pd.Series(resid, index=index, name="resid")
        rw = pd.Series(rw, index=index, name="robust_weight")
        cols = [f"seasonal_{period}" for period in self.periods]
        if seasonal.ndim == 1:
            seasonal = pd.Series(seasonal, index=index, name="seasonal")
        else:
            seasonal = pd.DataFrame(seasonal, index=index, columns=cols)

    # Avoid circular imports
    from statsmodels.tsa.seasonal import DecomposeResult

    return DecomposeResult(y, seasonal, trend, resid, rw)
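# A usage sketch for the fit above, assuming the surrounding class is
# statsmodels' MSTL; the periods and the synthetic hourly series are
# illustrative.
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import MSTL

t = np.arange(24 * 7 * 4)
demo = (10 + 0.01 * t
        + 3 * np.sin(2 * np.pi * t / 24)            # daily cycle
        + 5 * np.sin(2 * np.pi * t / (24 * 7))      # weekly cycle
        + np.random.default_rng(0).normal(scale=0.5, size=t.size))
res = MSTL(pd.Series(demo), periods=(24, 24 * 7), lmbda="auto").fit()
print(res.seasonal.head())   # one column per seasonal period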
def lastF(y, m=12, h=12*2, comb="OLS", aggList=None, include_history=True,
          cap=None, capF=None, changepoints=None, n_changepoints=25,
          yearly_seasonality=True, weekly_seasonality='auto',
          daily_seasonality='auto', holidays=None,
          seasonality_prior_scale=10.0, holidays_prior_scale=10.0,
          changepoint_prior_scale=0.05, mcmc_samples=0, interval_width=0.80,
          uncertainty_samples=0, transform=None):
    """
    Parameters
    ----------------
    y - dataframe of time-series data
        Layout: 1st Col - Time instances
                2nd Col - Total of TS
    m - (int) frequency of the time series, e.g. weekly is 52 (len(y) > 2*m)
    h - (int) the forecast horizon for the time series
    comb - (String) the type of hierarchical forecasting method to use.
        Options:
        "OLS" - optimal combination by ordinary least squares (default),
        "WLSS" - optimal combination by structurally weighted least squares,
        "WLSV" - optimal combination by variance weighted least squares,
        "BU" - bottom-up combination
    aggList - (list) the factors to consider, e.g. m = 52, aggList = [1, 52]
    include_history - (Boolean) input for the forecasting function of Prophet
    cap - (DataFrame or constant) carrying capacity of the input time series.
        If it is a DataFrame, the number of columns must equal
        len(y.columns) - 1
    capF - (DataFrame or constant) carrying capacity of the future time
        series. If it is a DataFrame, the number of columns must equal
        len(y.columns) - 1
    changepoints - (DataFrame or list) changepoints for the model to consider
        fitting. If it is a DataFrame, the number of columns must equal
        len(y.columns) - 1
    n_changepoints - (constant or list) number of changepoints for the model
        to consider fitting. If it is a list, the number of items must equal
        len(y.columns) - 1
    transform - (None or "BoxCox") transform the data before fitting the
        Prophet function
    All other inputs - see Prophet

    Returns
    -----------------
    newDict - a dictionary of DataFrames with predictions, seasonalities and
        trends that can all be plotted
    """
    ##
    # Error Catching
    ##
    if not isinstance(y.iloc[:, 0], pd.DatetimeIndex):
        y.iloc[:, 0] = pd.DatetimeIndex(y.iloc[:, 0])
    if m <= 1:
        sys.exit("Seasonal period (m) must be greater than 1")
    if len(y) < 2 * m:
        sys.exit("Need at least 2 periods of data")
    if aggList is not None:
        if 1 not in aggList or m not in aggList:
            sys.exit("1 and the seasonal period must be included in the aggList input")
    ##
    # Compute Aggregate Time Series and return a dictionary of dataframes
    ##
    aggs = aggHier(y, m, aggList)
    ##
    # Transform Variables
    ##
    if transform is not None:
        if transform == 'BoxCox':
            import warnings
            warnings.simplefilter("error", RuntimeWarning)
            boxcoxT = [None] * len(aggs.keys())
            try:
                i = 0
                placeHold = []
                for key in sorted(aggs.keys()):
                    placeHold.append(aggs[key].copy())
                    placeHold[i].iloc[:, 1], boxcoxT[i] = boxcox(placeHold[i].iloc[:, 1])
                    i += 1
                i = 0
                for key in sorted(aggs.keys()):
                    aggs[key] = placeHold[i]
                    i += 1
            ##
            # Does a Natural Log Transform if scipy's boxcox can't deal
            ##
            except RuntimeWarning:
                print("It looks like scipy's boxcox function couldn't deal "
                      "with your data. Proceeding with Natural Log Transform")
                i = 0
                for key in sorted(aggs.keys()):
                    aggs[key].iloc[:, 1] = boxcox(aggs[key].iloc[:, 1], lmbda=0)
                    boxcoxT[i] = 0
                    i += 1
        else:
            print("Nothing will be transformed because the input was not 'BoxCox'")
            boxcoxT = None  # keep boxcoxT defined for reconcile() below
    else:
        boxcoxT = None
    ##
    # Forecast and Reconcile
    ##
    with contextlib.redirect_stdout(open(os.devnull, "w")):
        forecastsDict, mse, resids = fitProphet(
            aggs, h, include_history, cap, capF, changepoints, n_changepoints,
            yearly_seasonality, weekly_seasonality, daily_seasonality,
            holidays, seasonality_prior_scale, holidays_prior_scale,
            changepoint_prior_scale, mcmc_samples, interval_width,
            uncertainty_samples)
    newDict = reconcile(forecastsDict, h, mse, resids, comb, boxcoxT)
    return newDict
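# The fallback pattern above (try Box-Cox, fall back to a natural log when
# the lambda fit blows up) also works as a small standalone helper; the
# function name is illustrative.
import warnings
from scipy.stats import boxcox

def boxcox_or_log(values):
    """Return (transformed values, lambda); lambda 0 means the log fallback."""
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        try:
            transformed, lmbda = boxcox(values)
            return transformed, lmbda
        except RuntimeWarning:
            return boxcox(values, lmbda=0), 0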
sns.distplot(df_fixed['Fare'][:train_num])
plt.show()

df_fixed = MMEncoder.fit_transform(df_fixed)
train_X = df_fixed[:train_num]
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

"""# Exercise 2

* Running the final boxcox block as-is raises an error because the input
  contains negative values. How can the data be fixed so that boxcox can be
  used? (Hint: try adjusting the data)
"""

# Apply boxcox to Fare, look at the distribution plot, and compute the score
# (running this raises an error; try to fix it)
from scipy import stats
df_fixed = copy.deepcopy(df)
"""
df_fixed['LotArea'] = stats.boxcox(df_fixed['LotArea'], lmbda=0.15)
"""
df_fixed['Pclass'] = stats.boxcox(df_fixed['Pclass'], lmbda=0.15)
sns.distplot(df_fixed['Pclass'][:train_num])
plt.show()

df_fixed = MMEncoder.fit_transform(df_fixed)
train_X = df_fixed[:train_num]
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()
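# One way to satisfy the exercise's hint: shift the column so every value is
# strictly positive before calling boxcox. A hedged sketch (the +1 offset and
# the use of 'Fare' are illustrative):
from scipy import stats

fare = df['Fare']
fare_shifted = fare - fare.min() + 1   # the minimum maps to 1, so all values are > 0
fare_boxcox = stats.boxcox(fare_shifted, lmbda=0.15)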
import numpy as np
from scipy import stats as sts

# compute skewness
# skewValues = sts.skew(cell_num)
print(sts.skew(cell_num))  # numpy.ndarray
type(sts.skew(cell_num))

skewValues = cell_num.apply(sts.skew, axis=0)  # pandas.Series
print(skewValues)

### Box-Cox Transformation
# First try the first six values of AreaCh1
# (boxcox only accepts 1-D arrays and estimates lambda automatically)
from scipy import stats
print(cell['AreaCh1'].head(6))
stats.boxcox(cell['AreaCh1'].head(6))

# stats.boxcox() returns a two-element tuple: the Box-Cox-transformed AreaCh1
# and the estimated lambda
type(stats.boxcox(cell['AreaCh1'].head(6)))  # tuple

# Extract the transformed AreaCh1 and the lambda estimate separately
stats.boxcox(cell_num['AreaCh1'])[0]
stats.boxcox(cell_num['AreaCh1'])[1]

help(stats.boxcox)

# Note: an alternative Box-Cox formula
# (accepts 2-D arrays, but lambda must be supplied)
from scipy.special import boxcox1p
lam = 0.16
cell_num_bc = boxcox1p(cell_num, lam)
cell_num_bc
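# boxcox1p(x, lam) is the Box-Cox transform of 1 + x, which is convenient for
# columns containing zeros; a quick check of the relationship:
import numpy as np
from scipy.special import boxcox1p
from scipy.stats import boxcox

x = np.array([0.0, 0.5, 2.0, 10.0])
lam = 0.16
assert np.allclose(boxcox1p(x, lam), boxcox(x + 1, lam))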