def fit_transform(self, X, y=None):
    """Fit Box-Cox lambdas on ``X`` and return the transformed data.

    One-dimensional input is transformed as a whole and ``self.lambdas``
    holds the single fitted lambda.  For input with more than one dimension
    each column ``X[:, col]`` is fitted on its own, ``self.lambdas`` becomes
    a tuple with one lambda per column, and the transformed columns are
    returned in their original positions.  ``y`` is accepted but unused.
    """
    if len(X.shape) <= 1:
        transformed, self.lambdas = stats.boxcox(X)
        return np.array(transformed)

    # One (transformed column, lambda) pair per column of X.
    fitted = [stats.boxcox(X[:, idx]) for idx in range(X.shape[1])]
    columns, self.lambdas = zip(*fitted)
    # The columns were collected as rows; transpose back to the input layout.
    return np.array(columns).T
def transform(self, X):
    """Shift, Box-Cox transform and standardise ``X`` with the fitted state.

    Reads ``self.shift``, ``self.lmbda``, ``self.xmean`` and ``self.xstd``.
    When ``self.lmbda`` is a single float the whole input is transformed with
    it; otherwise ``self.lmbda`` is iterated and entry ``j`` is applied to
    column ``X[:, j]``.

    Parameters
    ----------
    X : array_like
        Data to transform.  It is NOT modified.

    Returns
    -------
    numpy.ndarray
        ``(boxcox(X + shift) - xmean) / xstd``.
    """
    # Work on a shifted copy.  The previous ``X += self.shift`` changed the
    # caller's array in place (so transforming the same array twice shifted
    # it twice) and raised a casting error for integer arrays.
    shifted = numpy.asarray(X) + self.shift
    if isinstance(self.lmbda, float):
        xb = boxcox(shifted, self.lmbda)
    else:
        xb = numpy.zeros(shape=shifted.shape)
        for j, lmb in enumerate(self.lmbda):
            xb[:, j] = boxcox(shifted[:, j], lmb)
    return (xb - self.xmean) / self.xstd
def better_loglikelihood(self, param_e):
    """Return a Gaussian log-likelihood of the measurement given ``param_e``.

    All but the last two entries of ``param_e`` are handed to the result
    producer (``emulate`` when its ``typ`` is "emulator", ``run`` when it is
    "swmm").  The last two entries scale the covariance:
    ``param_e[-1] * cov_mat_b_base + cov_mat_e_base * param_e[-2]``.

    Measurement and produced result are compared after a Box-Cox transform
    with lambda 0.35; values that are not positive are replaced by 0 and
    0.01 is added to everything first so the transform is defined.
    """
    producer = self.result_producing_thing
    if producer.typ == "emulator":
        producer.emulate(param_e[0:-2])
    if producer.typ == "swmm":
        producer.run(param_e[0:-2])

    observed = self.measurement
    data = stats.boxcox((observed > 0) * observed + 0.01, 0.35)
    simulated = producer.result
    mean = stats.boxcox((simulated > 0) * simulated + 0.01, 0.35)

    covariance = param_e[-1] * self.cov_mat_b_base + self.cov_mat_e_base * param_e[-2]

    residual = mean - data
    log_det = np.linalg.slogdet(covariance)[1]
    quadratic_form = np.dot(residual, np.linalg.solve(covariance, residual))
    # log N(data | mean, covariance) for a vector of length self.t
    lik = -0.5 * log_det - 0.5 * quadratic_form - 0.5 * self.t * np.log(2 * np.pi)
    return lik
def executeOneSetting(tensor, density, roundId, para):
    """Run one (density, round) experiment over every time slice of ``tensor``.

    For each slice ``tensor[:, :, sliceId]`` the function Box-Cox transforms
    and min-max rescales the data, samples train/test entries via
    ``evallib.removeEntries``, calls ``AMF.predict``, maps the predictions
    back to the original scale, evaluates them with ``evallib.errMetric`` and
    dumps ``(evalResult, runningTime)`` to a per-slice ``.tmp`` file.

    Parameters
    ----------
    tensor : array with shape (numUser, numService, numTime)
    density : float, forwarded to ``evallib.removeEntries`` and used in
        log messages / the output file name
    roundId : int, zero-based round index (logged and named as roundId + 1)
    para : dict; the keys read here are 'dimension', 'metrics', 'outPath',
        'dataName' and 'dataType' (the whole dict is also passed to
        ``AMF.predict``)

    NOTE(review): ``xrange`` and ``time.clock`` are Python 2 era APIs;
    ``time.clock`` was removed in Python 3.8.
    """
    logger.info('density=%.2f, %2d-round starts.'%(density, roundId + 1))
    (numUser, numService, numTime) = tensor.shape
    dim = para['dimension']

    # initialization
    # U, S, p, q are created once and the same objects are passed to
    # AMF.predict for every slice -- presumably updated in place there so the
    # model carries over between slices; TODO confirm against AMF.predict.
    U = np.random.rand(numUser, dim)
    S = np.random.rand(numService, dim)
    p = np.zeros(numUser)
    q = np.zeros(numService)

    # run for each time slice
    for sliceId in xrange(numTime):
        # boxcox data transformation
        matrix = tensor[:, :, sliceId]
        dataVector = matrix[:]
        # lambda (alpha) and the min/max used for rescaling are estimated
        # from the strictly positive entries only
        (transfVector, alpha) = stats.boxcox(dataVector[dataVector > 0])
        maxV = np.max(transfVector)
        minV = np.min(transfVector)
        transfMatrix = matrix.copy()
        # entries equal to -1 are left untouched by both steps below; note
        # that the mask is re-evaluated on the already transformed values
        transfMatrix[transfMatrix != -1] = stats.boxcox(transfMatrix[transfMatrix != -1], alpha)
        transfMatrix[transfMatrix != -1] = (transfMatrix[transfMatrix != -1] - minV) / (maxV - minV)

        # remove data entries to generate trainMatrix and testMatrix
        # the seed depends on both round and slice
        seedID = roundId + sliceId * 100
        (trainMatrix, testMatrix) = evallib.removeEntries(matrix, density, seedID)
        # training uses the transformed values where a training entry is > 0,
        # and 0 elsewhere
        trainMatrix = np.where(trainMatrix > 0, transfMatrix, 0)
        # test targets are the untransformed values at the nonzero positions
        # of testMatrix
        (testVecX, testVecY) = np.where(testMatrix)
        testVec = matrix[testVecX, testVecY]

        # invocation to the prediction function
        startTime = time.clock()
        predictedMatrix = AMF.predict(trainMatrix, U, S, p, q, para)
        runningTime = float(time.clock() - startTime)

        # evaluate the estimation error
        predVec = predictedMatrix[testVecX, testVecY]
        # undo the min-max rescaling, then hand the values and alpha to
        # evallib.argBoxcox (presumably the inverse Box-Cox -- verify)
        predVec = (maxV - minV) * predVec + minV
        predVec = evallib.argBoxcox(predVec, alpha)
        evalResult = evallib.errMetric(testVec, predVec, para['metrics'])
        result = (evalResult, runningTime)

        # dump the result at each density
        outFile = '%s%s_%s_result_%02d_%.2f_round%02d.tmp'%(para['outPath'], para['dataName'], para['dataType'], sliceId + 1, density, roundId + 1)
        evallib.dumpresult(outFile, result)
        logger.info('sliceId=%02d done.'%(sliceId + 1))

    logger.info('density=%.2f, %2d-round done.'%(density, roundId + 1))
    logger.info('----------------------------------------------')
def test_fixed_lmbda(self): np.random.seed(12345) x = stats.loggamma.rvs(5, size=50) + 5 xt = stats.boxcox(x, lmbda=1) assert_allclose(xt, x - 1) xt = stats.boxcox(x, lmbda=-1) assert_allclose(xt, 1 - 1/x) xt = stats.boxcox(x, lmbda=0) assert_allclose(xt, np.log(x)) # Also test that array_like input works xt = stats.boxcox(list(x), lmbda=0) assert_allclose(xt, np.log(x))
def auto_arima(endog, freq=None, d=None, D=None, max_p=5, max_q=5, max_P=2, max_Q=2, max_order=5, max_d=2, max_D=1, start_p=2, start_q=2, start_P=1, start_Q=1, stationary=False, ic="aic", stepwise=True, trace=False, approximation=None, test="adf", seasonal_test="ch", allowdrift=True, allowmean=True, lambda_parameter=None, *args, **kwargs): # Parameter Validity Check if np.any(np.isnan(endog)): raise ValueError("Missing Values in Series") origin_endog = endog if _is_using_pandas(endog, None): endog = np.asarray(endog) if len(endog) <= 10: raise ValueError("There are too few observations.") if np.any(np.isnan(endog)): raise ValueError("NaN values in endogenous not allowed") if np.all(endog == endog[0]): raise ValueError("The endogenous variable is a constant") if (not isinstance(freq, int)) or freq <= 1: raise ValueError("The frequency parameter must be a integer greater than 1") if lambda_parameter is not None: if lambda_parameter < 0: raise ValueError("The Lambda parameter must be positive") if not np.all(endog > 0): raise ValueError("Box-Cox Transformation can be only used on positive series.") endog = boxcox(endog, lambda_parameter) max_p = max_p if max_p <= floor(len(endog) / 3) else floor(len(endog) / 3) max_q = max_q if max_q <= floor(len(endog) / 3) else floor(len(endog) / 3) max_P = max_P if max_P <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq) max_Q = max_Q if max_Q <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq) if stationary: D = 0 d = 0 if freq == 1:
def test_alpha(self): np.random.seed(1234) x = stats.loggamma.rvs(5, size=50) + 5 # Some regular values for alpha, on a small sample size _, _, interval = stats.boxcox(x, alpha=0.75) assert_allclose(interval, [4.004485780226041, 5.138756355035744]) _, _, interval = stats.boxcox(x, alpha=0.05) assert_allclose(interval, [1.2138178554857557, 8.209033272375663]) # Try some extreme values, see we don't hit the N=500 limit x = stats.loggamma.rvs(7, size=500) + 15 _, _, interval = stats.boxcox(x, alpha=0.001) assert_allclose(interval, [0.3988867, 11.40553131]) _, _, interval = stats.boxcox(x, alpha=0.999) assert_allclose(interval, [5.83316246, 5.83735292])
def test_mle(self): maxlog = stats.boxcox_normmax(self.x, method='mle') assert_allclose(maxlog, 1.758101, rtol=1e-6) # Check that boxcox() uses 'mle' _, maxlog_boxcox = stats.boxcox(self.x) assert_allclose(maxlog_boxcox, maxlog)
def readIn_PredictionData(fn, dfmax, dfmin, transformationFunction):
    """Read a CSV of prediction inputs, transform and normalise every column.

    Parameters
    ----------
    fn : path of the comma-separated input file; its first row is used as
        the column header.
    dfmax, dfmin : indexable by column position; ``dfmax[i]`` / ``dfmin[i]``
        are passed to ``normalize_predictioninput`` for column ``i``.
    transformationFunction : str
        "bin"    - values >= 31 are replaced by 50;
        "binlog" - 0 becomes 0.5, values >= 31 become 50, then natural log;
        "log"    - 0 becomes 0.5, then natural log;
        "sqrt"   - square root;
        "boxcox" - Box-Cox with a lambda fitted per column;
        any other value leaves the column untransformed.

    Returns
    -------
    pandas.DataFrame with every column transformed and then normalised.
    """
    # ``header=False`` raises TypeError in current pandas ("Passing a bool to
    # header is invalid").  Legacy pandas treated False as row 0, so the
    # explicit ``header=0`` keeps the historical behaviour.
    df = pd.read_csv(fn, sep=",", header=0)
    print(len(df.columns))
    for i, col in enumerate(df.columns):
        if transformationFunction == "bin":
            df[col] = [(x if x < 31 else 50) for x in df[col]]
        elif transformationFunction == "binlog":
            df[col] = [(0.5 if x == 0 else (x if x < 31 else 50)) for x in df[col]]
            df[col] = np.log(df[col])
        elif transformationFunction == "log":
            # 0.5 stands in for 0 so the logarithm stays finite
            df[col] = [(0.5 if x == 0 else x) for x in df[col]]
            df[col] = np.log(df[col])
        elif transformationFunction == "sqrt":
            df[col] = np.sqrt(df[col])
        elif transformationFunction == "boxcox":
            df[col] = stats.boxcox(np.array(df[col]))[0]
        # normalisation is applied to every column, whatever the transform
        df[col] = normalize_predictioninput(np.array(df[col]), dfmax[i], dfmin[i])
    return df
def sgs(data, xsteps=10, ysteps=10, nugget_dist=10, x_col='x_m', y_col='y_m', flux_col='flux', transform_data=True, invert_transform=True):
    """Simulate flux values on a grid, one point at a time.

    Every grid point visited (in the order given by ``makePathAndGrid``) is
    estimated with a kriging model built from all data available so far, and
    the estimate is appended to that data before the next point is handled.

    Parameters
    ----------
    data : pandas.DataFrame holding the columns named by ``x_col``,
        ``y_col`` and ``flux_col``.
    xsteps, ysteps : forwarded to ``makePathAndGrid``.
    nugget_dist : forwarded to ``kriging.krig_model``.
    x_col, y_col, flux_col : column names for coordinates and flux.
    transform_data : Box-Cox transform the flux before simulating.
    invert_transform : map the results back with ``invboxcox``; only has an
        effect when ``transform_data`` is also true.

    Returns
    -------
    tuple ``(grid_x, grid_y, M, new_x, new_y, new_flux)`` where ``grid_x`` /
    ``grid_y`` are the grid coordinates reshaped like ``M``, ``M`` holds the
    simulated values and ``new_*`` list the simulated points in visit order.
    """
    # Honour the column-name parameters.  The previous code read data.x_m,
    # data.y_m and data.flux directly, silently ignoring x_col / y_col /
    # flux_col (the defaults are the same names, so existing callers are
    # unaffected).
    x = data[x_col].values
    y = data[y_col].values
    flux = data[flux_col].values
    if transform_data:
        flux, L = scpstats.boxcox(flux)
    data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])
    new_x = []
    new_y = []
    new_flux = []
    # create array for the output
    idx, grid, indexGrid, M = makePathAndGrid(data, xsteps, ysteps)
    for step in idx:
        point = [grid[0][step], grid[1][step]]
        model = kriging.krig_model(data, nugget_dist, x_col, y_col, flux_col)
        est = kriging.krig_sample(model, point)
        indexPoint = [indexGrid[0][step], indexGrid[1][step]]
        M[indexPoint[0], indexPoint[1]] = est
        # the simulated point becomes part of the conditioning data
        x = np.r_[x, point[0]]
        new_x.append(x[-1])
        y = np.r_[y, point[1]]
        new_y.append(y[-1])
        flux = np.r_[flux, est]
        new_flux.append(flux[-1])
        data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])
    # L only exists when the data was transformed, hence the double condition
    if invert_transform and transform_data:
        M = invboxcox(M, L)
        new_flux = invboxcox(np.array(new_flux), L)
    return grid[0,:].reshape(M.shape), grid[1,:].reshape(M.shape), M, new_x, new_y, new_flux
def processing(data):
    """Clean and transform the feature frame ``data`` in place.

    Steps, in order: build new features (``create_feature``), drop the
    columns listed in ``to_drop``, fill missing values, Box-Cox transform
    numeric columns when that lowers the absolute skewness, and finally
    apply the value mappings in ``mapper``.  Nothing is returned.
    """
    # Build new features.
    create_feature(data)
    # Drop unwanted features.
    data.drop(to_drop, axis=1, inplace=True)
    # Fill with the literal 'None': according to the feature description
    # 'None' is itself a valid value of these features, so their missing
    # entries are filled with it.
    fill_none = ['MasVnrType', 'BsmtExposure', 'GarageType', 'MiscFeature']
    for col in fill_none:
        # Assign the result back instead of ``data[col].fillna(inplace=True)``:
        # the in-place call on a selected column is chained assignment and is
        # not guaranteed to reach ``data``.
        data[col] = data[col].fillna('None')
    # Fill the remaining missing values: mode for categorical (object)
    # features, median for numeric ones.
    na_col = data.dtypes[data.isnull().any()]
    for col in na_col.index:
        if na_col[col] != 'object':
            data[col] = data[col].fillna(data[col].median())
        else:
            data[col] = data[col].fillna(data[col].mode()[0])
    # Normalise skewed features.  numeric_col are the numeric features,
    # zero_col the columns that contain zeros.  Transforming zeros causes all
    # sorts of small problems, so for those columns only the positive values
    # are transformed.
    # numeric_only=True: skew() on a frame that still has string columns
    # raises on current pandas.
    numeric_col = data.skew(numeric_only=True).index
    zero_col = data.columns[data.isin([0]).any()]
    for col in numeric_col:
        # Skip "condition"-like features with few distinct values (0, 1, 2,
        # 3, ...); transforming them is not meaningful.
        if data[col].nunique() <= 10:
            continue
        if col in zero_col:
            # Feature contains zeros: transform the positive values only;
            # boxcox picks the lambda from the data.
            # The previous ``data[data>0][col]`` masked instead of filtered:
            # non-positive entries became NaN, were fed to boxcox and were
            # then written back as NaN.
            positive = data[col] > 0
            trans_data = data.loc[positive, col]
            before = abs(trans_data.skew())
            cox, _ = boxcox(trans_data)
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data.loc[positive, col] = cox
        else:
            # Feature without zeros: transform every value.
            before = abs(data[col].skew())
            cox, _ = boxcox(data[col])
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data.loc[:, col] = cox
    # Apply the value mappings from ``mapper``.
    for col, mapp in mapper.items():
        data.loc[:, col] = data[col].map(mapp)
def transform(self, x):
    """Transform every column of ``x`` with its fitted parameter.

    Column ``i`` of ``x`` is paired with ``self.trans_params[i]`` and passed
    to ``self.w_t`` (method 'lambert') or ``boxcox`` (method 'boxcox').  Any
    other method raises ``NotImplementedError``.  The result has the same
    column layout as the input.
    """
    x = np.asarray(x)
    if self.method == 'lambert':
        column_transform = self.w_t
    elif self.method == 'boxcox':
        column_transform = boxcox
    else:
        raise NotImplementedError

    transformed = [
        column_transform(column, param)
        for column, param in zip(x.T, self.trans_params)
    ]
    return np.array(transformed).T
def boxcoxtransform(dataframe, numeric_feats):
    """Box-Cox transform the skewed numeric columns of ``dataframe`` in place.

    Skewness is computed per column of ``numeric_feats`` with missing values
    dropped.  Columns whose skewness exceeds 0.25 are shifted by +1 and then
    replaced by their Box-Cox transform.

    Returns the (modified) ``dataframe`` and a ``defaultdict(float)`` mapping
    each transformed column name to its fitted lambda.
    """
    lam = defaultdict(float)
    skewness = dataframe[numeric_feats].apply(lambda column: skew(column.dropna()))
    selected = skewness[skewness > 0.25].index
    for name in selected:
        # shift first so the column is written back even if boxcox fails
        dataframe[name] = dataframe[name] + 1
        transformed, fitted_lambda = boxcox(dataframe[name])
        dataframe[name] = transformed
        lam[name] = fitted_lambda
    return dataframe, lam
def test_lmbda_None(self): np.random.seed(1234567) # Start from normal rv's, do inverse transform to check that # optimization function gets close to the right answer. np.random.seed(1245) lmbda = 2.5 x = stats.norm.rvs(loc=10, size=50000) x_inv = (x * lmbda + 1)**(-lmbda) xt, maxlog = stats.boxcox(x_inv) assert_almost_equal(maxlog, -1 / lmbda, decimal=2)
def fit(self, x):
    """Estimate one transformation parameter per column of ``x``.

    The parameters are appended to ``self.trans_params`` in column order:
    ``self.iterate_moments(column, tol=self.tol, max_iter=self.max_iter)``
    for method 'lambert', the fitted Box-Cox lambda for method 'boxcox'.
    Any other method raises ``NotImplementedError``.
    """
    x = np.asarray(x)
    if self.method == 'lambert':
        def estimate(column):
            return self.iterate_moments(column, tol=self.tol, max_iter=self.max_iter)
    elif self.method == 'boxcox':
        def estimate(column):
            return boxcox(column)[1]
    else:
        raise NotImplementedError

    for column in x.T:
        self.trans_params.append(estimate(column))
def boxcox(x, y, y_label):
    """Fit ``regr`` on a Box-Cox transformed target and plot the predictions.

    ``y`` is shifted by ``abs(min(y)) + 1`` before the transform.  The
    module-level regressor ``regr`` is fitted on ``x`` against the
    transformed target, its predictions are mapped back with ``inv_boxcox``
    and the shift is removed again.  The ratio var(prediction) / var(y) is
    printed under the label "R squared" and a scatter plot of ``y`` against
    the predictions is shown.

    Parameters
    ----------
    x : features accepted by ``regr.fit`` / ``regr.predict``.
    y : numeric array of targets.
    y_label : label for the x-axis of the plot.
    """
    # computed once; it is needed for the shift and for undoing it
    offset = abs(min(y))
    box_cox, maxlog = stats.boxcox(y + offset + 1)
    regr.fit(x, box_cox)
    box_cox_predict = regr.predict(x)
    y_predict = inv_boxcox(box_cox_predict, maxlog) - offset - 1
    # print() with one parenthesised argument behaves the same on Python 2
    # and 3; the former print statement is a SyntaxError on Python 3.
    print("R squared: " + str(np.var(y_predict)/np.var(y)))
    # Plot outputs
    fig = plt.figure()
    plt.scatter(y, y_predict, color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()
def box_cox(df, lmbda=None, alpha=None):
    """Box-Cox transform every column (feature) of a pandas DataFrame.

    Box-Cox needs strictly positive data, so each column is first shifted to
    ``column - min(column) + 0.1``.

    Parameters
    ----------
    df : pandas.DataFrame of numeric columns.
    lmbda : scalar or None
        Lambda applied to every column; ``None`` fits a lambda per column.
    alpha : float or None
        Forwarded to ``scipy.stats.boxcox``; only the transformed data is
        kept, any confidence interval is discarded.

    Returns
    -------
    pandas.DataFrame with the columns and index of ``df`` holding the
    transformed values.
    """
    transformed = {}
    for val in df.columns:
        shifted = (df[val] - df[val].min() + 0.1).values
        result = stats.boxcox(shifted, lmbda, alpha)
        # With an explicit lmbda scipy returns the transformed array itself;
        # otherwise it returns a tuple whose first item is the array.  The
        # former unconditional ``[0]`` picked a single scalar in the first
        # case.
        transformed[val] = result if lmbda is not None else result[0]
    # Building the frame in one go also keeps the input's index, which the
    # former empty-frame-then-assign approach replaced by a fresh range.
    return pd.DataFrame(transformed, index=df.index, columns=df.columns)
def transform_data_to_gaussian_1D(feature_vector):
    """Return the Box-Cox transform of a one-dimensional feature vector.

    The lambda is fitted by ``scipy.stats.boxcox`` and discarded; only the
    transformed values are returned.  ``scipy.stats.boxcox`` requires the
    data to be strictly positive.
    """
    # The previous in-function matplotlib import and the locals ``x`` / ``n``
    # were never used; the import made the function fail when matplotlib was
    # not installed.
    new_x, _ = stats.boxcox(feature_vector)
    return new_x
def fit(self, X):
    """Fit shift, Box-Cox lambda(s) and standardisation statistics on ``X``.

    Sets ``self.shift``, ``self.lmbda``, ``self.xmean`` and ``self.xstd``.
    ``X`` itself is NOT modified.

    2-D input (handled per column):
        shift = max(-min, 0) + 3 * std; the fitted lambda is clipped to
        ``[self.minlmbda, self.maxlmbda]`` and set to exactly 0 when its
        absolute value is below 1e-4.
    1-D input:
        shift = max(1e-10, -min) + 3 * std; the fitted lambda is used as is.
    Other dimensionalities fit nothing; the mean/std are then taken over an
    all-zero array.
    """
    xtrans = numpy.zeros(shape=X.shape)
    if len(X.shape) == 2:
        # float copy so adding the std below also works for integer input
        shift = -X.min(axis=0).astype(float)
        shift[shift < 0] = 0
        self.shift = shift + 3 * X.std(axis=0)
        # Shifted copy: the former ``X += self.shift`` modified the caller's
        # array, so passing the same array to transform() afterwards applied
        # the shift a second time.
        shifted = X + self.shift
        self.lmbda = numpy.zeros(shifted.shape[1])
        for j in range(shifted.shape[1]):
            _, fitted = boxcox(shifted[:, j])
            fitted = min(max(fitted, self.minlmbda), self.maxlmbda)
            if numpy.abs(fitted) < 1e-4:
                fitted = 0
                print("changing lambda")
            self.lmbda[j] = fitted
            xtrans[:, j] = boxcox(shifted[:, j], self.lmbda[j])
    elif len(X.shape) == 1:
        self.shift = max([1e-10, -X.min()]) + 3 * X.std()
        shifted = X + self.shift
        xtrans, self.lmbda = boxcox(shifted)
    self.xmean = xtrans.mean(axis=0)
    self.xstd = xtrans.std(axis=0)
def append_boxcox(data, cols, drop_old=False):
    """Apply boxcox transformations to a list of columns.

    For every column ``col`` a new column ``col + '_boxcox'`` holding the
    transformed values is added to ``data`` in place.  Nothing is returned.

    data: a pandas DataFrame
    cols: a column name, or a list of column names, for which to perform
          boxcox transformations
    drop_old: when true, each source column is dropped after its transformed
          counterpart has been added
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so a
    # single column name is still recognised on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str
    if isinstance(cols, string_types):
        cols = [cols]
    for col in cols:
        # boxcox also returns maxlog, the lambda param that is chosen;
        # it could be used for pipelining objects
        data[col + '_boxcox'] = stats.boxcox(data[col])[0]
        if drop_old:
            data.drop(col, axis=1, inplace=True)
def boxcox(X):
    """
    Gaussianize X using the Box-Cox transformation: [samples x phenotypes]

    - each phenotype (column) is brought to a positive scale by first
      subtracting its minimum value and adding 1.
    - then each phenotype is transformed by the Box-Cox transformation with
      a lambda fitted on its non-NaN values.

    NaN entries are left as NaN in the output.

    Returns
    -------
    X_transformed : float array shaped like X
    maxlog : array with the fitted lambda of every column
    """
    # Local import: the scipy top-level NumPy aliases this function used
    # before (zeros_like / zeros / isnan reached through ``sp``, presumably
    # scipy) are deprecated in SciPy.
    import numpy as np

    X = np.asarray(X)
    # The output must be floating point.  ``zeros_like(X)`` inherited an
    # integer dtype from integer input and truncated every transformed value.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else float
    X_transformed = np.zeros_like(X, dtype=out_dtype)
    maxlog = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        i_nan = np.isnan(X[:, i])
        values = X[~i_nan, i]
        # keep missing entries as they are
        X_transformed[i_nan, i] = X[i_nan, i]
        X_transformed[~i_nan, i], maxlog[i] = st.boxcox(values - values.min() + 1.0)
    return X_transformed, maxlog
def transform_features(x_train, x_test):
    """Box-Cox transform the first six feature columns.

    Only columns 0-5 are kept (this removes the vibrato features).  Every
    kept column is shifted by 1.0; the optimal lambda is computed on the
    training column and the same lambda is then applied to the matching
    testing column.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Untransformed training features.
    x_test : np.array [n_samples, n_features]
        Untransformed testing features.

    Returns
    -------
    x_train_boxcox : np.array [n_samples, n_features_trans]
        Transformed training features.
    x_test_boxcox : np.array [n_samples, n_features_trans]
        Transformed testing features.
    """
    x_train = x_train[:, 0:6]
    x_test = x_test[:, 0:6]
    _, n_feats = x_train.shape

    offset = 1.0  # shift features away from zero
    lmbda_opt = np.zeros((n_feats,))
    x_train_boxcox = np.zeros(x_train.shape)
    x_test_boxcox = np.zeros(x_test.shape)
    for col in range(n_feats):
        # fit on the training column ...
        x_train_boxcox[:, col], lmbda_opt[col] = boxcox(x_train[:, col] + offset)
        # ... and reuse that lambda for the testing column
        x_test_boxcox[:, col] = boxcox(x_test[:, col] + offset, lmbda=lmbda_opt[col])
    return x_train_boxcox, x_test_boxcox
def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    ``test`` receives a ``'loss'`` column filled with 0 (in place) before it
    is concatenated below ``train``.  Skewness is measured on ``train`` only
    (missing values dropped) and printed; every column with skewness above
    0.25 is shifted by +1 and Box-Cox transformed in the combined frame.

    Returns the combined frame and the number of training rows.
    """
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)

    # compute skew and do Box-Cox transformation (Tilli)
    skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))
    print("\nSkew in numeric features:")
    print(skewness)

    for name in skewness[skewness > 0.25].index:
        train_test[name] = train_test[name] + 1
        # the fitted lambda is not needed, only the transformed values
        train_test[name] = boxcox(train_test[name])[0]
    return train_test, ntrain
def preprocess_feature(cls, feature, parameters):
    """Normalise one feature array according to ``parameters``.

    ``parameters`` is read for ``feature_type`` and, depending on the type,
    ``boxcox_lambda`` / ``boxcox_shift``, ``quantiles``, ``possible_values``,
    ``min_value`` / ``max_value`` or ``mean`` / ``stddev``.

    Entries close to ``MISSING_VALUE`` are treated as missing.  BINARY and
    ENUM features return early (ENUM as a one-hot matrix with one column per
    possible value); every other type returns the transformed array with the
    missing entries multiplied to 0.
    """
    # 0 where the entry is (close to) MISSING_VALUE, 1 everywhere else
    is_not_empty = 1 - np.isclose(feature, MISSING_VALUE)
    if parameters.feature_type == identify_types.BINARY:
        # Binary features are always 1 unless they are 0
        return ((feature != 0) * is_not_empty).astype(np.float32)
    if parameters.boxcox_lambda is not None:
        # The shifted values are floored at BOX_COX_MARGIN before the
        # Box-Cox transform with the given lambda is applied.
        feature = stats.boxcox(
            np.maximum(feature + parameters.boxcox_shift, BOX_COX_MARGIN),
            parameters.boxcox_lambda,
        )
    # No *= to ensure consistent out-of-place operation.
    if parameters.feature_type == identify_types.PROBABILITY:
        # clip away from 0 and 1 so the logit stays finite
        feature = np.clip(feature, 0.01, 0.99)
        feature = special.logit(feature)
    elif parameters.feature_type == identify_types.QUANTILE:
        # element-wise mapping through cls.value_to_quantile
        transformed_feature = np.zeros_like(feature)
        for i in six.moves.range(feature.shape[0]):
            transformed_feature[i] = cls.value_to_quantile(
                feature[i], parameters.quantiles
            )
        feature = transformed_feature
    elif parameters.feature_type == identify_types.ENUM:
        possible_values = parameters.possible_values
        # value -> column index of the one-hot output
        mapping = {}
        for i, possible_value in enumerate(possible_values):
            mapping[possible_value] = i
        output_feature = np.zeros((len(feature), len(possible_values)))
        for i, val in enumerate(feature):
            if abs(val - MISSING_VALUE) < 1e-2:
                # This check is required by the PT preprocessing but not C2
                # (a missing entry keeps an all-zero row)
                continue
            output_feature[i][mapping[val]] = 1.0
        return output_feature
    elif parameters.feature_type == identify_types.CONTINUOUS_ACTION:
        min_value = parameters.min_value
        max_value = parameters.max_value
        # linear map of [min_value, max_value] onto [-1 + 1e-6, 1 - 1e-6]
        feature = (
            (feature - min_value) * ((1 - 1e-6) * 2 / (max_value - min_value))
            - 1
            + 1e-6
        )
    else:
        # default: standardise with mean / stddev, then clip
        feature = feature - parameters.mean
        feature /= parameters.stddev
        feature = np.clip(feature, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)
    # zero out the entries that were missing in the input
    feature *= is_not_empty
    return feature
def transform(self, x):
    """Transform every column of ``x`` with the fitted parameters ``self.taus``.

    A 1-d input is treated as a single column.  Input of any other
    dimensionality than 1 or 2, or a column count different from
    ``len(self.taus)``, only prints a message; processing continues.

    Strategies:
        'lambert' - ``w_t(column, tau)``
        'brute'   - rank-based mapping ``norm.ppf((rank - 0.5) / n)``
        'boxcox'  - ``boxcox(column, lmbda=tau)``
    Any other strategy raises ``NotImplementedError``.

    Returns an array with the transformed columns in input order.
    """
    x = np.asarray(x)
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    elif len(x.shape) != 2:
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statements were a SyntaxError on Python 3.
        print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")
    if x.shape[1] != len(self.taus):
        print("%d variables in test data, but %d variables were in training data." % (x.shape[1], len(self.taus)))
    if self.strategy == 'lambert':
        return np.array([w_t(x_i, tau_i) for x_i, tau_i in zip(x.T, self.taus)]).T
    elif self.strategy == 'brute':
        return np.array([norm.ppf((rankdata(x_i) - 0.5) / len(x_i)) for x_i in x.T]).T
    elif self.strategy == 'boxcox':
        return np.array([boxcox(x_i, lmbda=lmbda_i) for x_i, lmbda_i in zip(x.T, self.taus)]).T
    else:
        raise NotImplementedError
def fit(self, x):
    """Estimate one parameter per column of ``x`` and append it to ``self.taus``.

    A 1-d input is treated as a single column; input of any other
    dimensionality than 1 or 2 only prints a message.

    Strategies:
        'lambert' - ``igmm(column, tol=self.tol, max_iter=self.max_iter)``
        'brute'   - ``None`` (nothing is stored for this strategy)
        'boxcox'  - the fitted Box-Cox lambda
    Any other strategy raises ``NotImplementedError``.
    """
    x = np.asarray(x)
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    elif len(x.shape) != 2:
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statement was a SyntaxError on Python 3.
        print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")
    if self.strategy == 'lambert':
        for x_i in x.T:
            self.taus.append(igmm(x_i, tol=self.tol, max_iter=self.max_iter))
    elif self.strategy == 'brute':
        for x_i in x.T:
            self.taus.append(None)  # TODO: In principle, we could store parameters to do a quasi-invert
    elif self.strategy == 'boxcox':
        for x_i in x.T:
            self.taus.append(boxcox(x_i)[1])
    else:
        raise NotImplementedError
def Preprocess_TransformNumericFeatures(self, dfall, trans_type ='boxcox', correction=0.00001):
    """Box-Cox transform every column listed in ``self.num_features``.

    Parameters
    ----------
    dfall : pandas.DataFrame; the listed columns are overwritten in place.
    trans_type : str, only 'boxcox' is supported.
    correction : float added to each column before the transform.

    Returns
    -------
    ``dfall`` with the transformed columns.  The fitted lambda of each
    column is stored in ``self.lmbdaDict`` under the column name.

    Raises
    ------
    TypeError
        If ``self.num_features`` is None or ``trans_type`` is unsupported.
    """
    # (the ``return`` statements that used to follow each raise were
    # unreachable and have been removed)
    if self.num_features is None:
        raise TypeError("Execute the SetUpTrainTest method to use this feature")
    if trans_type not in ['boxcox']:
        raise TypeError("Transformation type not supported")
    self.lmbdaDict = {}
    for c in self.num_features:
        # Single-argument print() works on Python 2 and 3; the message also
        # gains the space that was missing before "transformation".
        print('Applying {} transformation on: {}'.format(trans_type, c))
        transformed, fitted_lambda = stats.boxcox(dfall[c] + correction)
        dfall[c] = transformed
        self.lmbdaDict[c] = fitted_lambda
    return dfall
def _estimate_lambda_single_y(y): """Estimate lambda for a single y, given a range of lambdas through which to search. No validation performed. Parameters ---------- y : ndarray, shape (n_samples,) The vector being estimated against """ # ensure is array y = np.array(y) # Use scipy's log-likelihood estimator b = boxcox(y, lmbda=None) # Return lambda corresponding to maximum P return b[1]
def fit(self, x, y=None):
    """Fit a Gaussianizing transformation to each variable/column in x.

    One coefficient per column is appended to ``self.coefs_``; ``y`` is
    accepted but unused.  A 1-d input is treated as a single column, and
    input that is neither 1-d nor 2-d only prints a message.  Returns
    ``self``.
    """
    x = np.asarray(x)
    n_dims = len(x.shape)
    if n_dims == 1:
        x = x[:, np.newaxis]
    elif n_dims != 2:
        print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

    # Pick the per-column estimator for the configured strategy.
    if self.strategy == 'lambert':
        if self.verbose:
            print("Gaussianizing with Lambert method")

        def estimate(column):
            return igmm(column, tol=self.tol, max_iter=self.max_iter)
    elif self.strategy == 'brute':
        def estimate(column):
            # TODO: In principle, we could store parameters to do a quasi-invert
            return None
    elif self.strategy == 'boxcox':
        def estimate(column):
            return boxcox(column)[1]
    else:
        raise NotImplementedError

    for column in x.T:
        self.coefs_.append(estimate(column))
    return self
def boxcox_xform(X, scaling=True):
    """
    Box-Cox transform of strictly positive data, optionally rescaled first.

    :param X: data (numeric list, Pandas series or 1d np array)
    :param scaling: when True the data is divided by max(abs(X)) before the
        transform, so the transformed values lie in (0, 1]; this avoids
        overflow/underflow for very large inputs
    :return:
        - ``(y, x_max, lbda)`` for strictly positive data: the transformed
          array, the scale that was applied (1.0 when ``scaling`` is not
          True) and the fitted lambda;
        - ``None`` when the data contains values <= 0 (a message pointing to
          the Yeo-Johnson transform is printed);
        - ``(None, None, None, None)`` when there is no data.
    """
    x_arr = np.array(list(X))
    # Check for missing data first: np.max() raises on an empty array, which
    # used to make the "no data" branch unreachable with scaling=True.
    if x_arr.size == 0:
        print('boxcox_xform: no data')
        return None, None, None, None

    x_max = np.max(np.abs(x_arr)) if scaling is True else 1.0
    if np.min(x_arr) <= 0.0:
        print('use YJ transform: yj_xform(X)')
        return None

    # only positive values
    z = x_arr / x_max  # scale to deal with overflow/underflow: values in (0, 1]
    y, lbda = sps.boxcox(z, lmbda=None, alpha=None)
    return y, x_max, lbda