Example #1
0
 def fit_transform(self, X, y=None):
     """Fit Box-Cox lambdas on X and return the transformed data.

     A one-dimensional ``X`` is transformed as a single sample; otherwise
     every column is fitted independently.  The fitted lambda(s) are stored
     on ``self.lambdas`` (a scalar for 1-D input, a tuple with one entry per
     column otherwise).  ``y`` is accepted but never used.
     """
     if len(X.shape) <= 1:
         transformed, self.lambdas = stats.boxcox(X)
         return np.array(transformed)

     # Fit each column on its own, then split the (data, lambda) pairs.
     fitted = []
     for col in range(X.shape[1]):
         fitted.append(stats.boxcox(X[:, col]))
     columns, self.lambdas = zip(*fitted)
     # ``columns`` holds one row per feature; transpose back to rows=samples.
     return np.array(columns).T
Example #2
0
 def transform(self, X):
     """Apply the fitted shift, Box-Cox transform and standardisation to X.

     Reads ``self.shift``, ``self.lmbda``, ``self.xmean`` and ``self.xstd``.
     When ``self.lmbda`` is a float, X is transformed as a whole; otherwise
     ``self.lmbda`` is iterated and column ``j`` of X uses the j-th lambda.

     The input array is left untouched, so calling this method repeatedly
     on the same array yields the same result every time.

     Returns the Box-Cox values minus ``self.xmean``, divided by ``self.xstd``.
     """
     # Out-of-place add: ``X += self.shift`` would modify the caller's array
     # (and fails outright for an integer array with a float shift).
     shifted = X + self.shift
     if isinstance(self.lmbda, float):
         xb = boxcox(shifted, self.lmbda)
     else:
         xb = numpy.zeros(shape=shifted.shape)
         for j, lmb in enumerate(self.lmbda):
             xb[:, j] = boxcox(shifted[:, j], lmb)
     return (xb - self.xmean) / self.xstd
Example #3
0
 def better_loglikelihood(self,param_e):
     """Return a Gaussian log-likelihood computed on Box-Cox transformed data.

     All but the last two entries of ``param_e`` are handed to the result
     producer (``emulate`` for typ "emulator", ``run`` for typ "swmm").  The
     last entry scales ``self.cov_mat_b_base`` and the second-to-last scales
     ``self.cov_mat_e_base``; their sum is the covariance.  Measurement and
     produced result are both clipped at zero, offset by 0.01 and Box-Cox
     transformed with lambda 0.35 before being compared.
     """
     producer = self.result_producing_thing
     if producer.typ == "emulator":
         producer.emulate(param_e[0:-2])
     if producer.typ == "swmm":
         producer.run(param_e[0:-2])

     def to_transformed(values):
         # Non-positive entries become 0, then offset and transform.
         return stats.boxcox((values > 0) * values + 0.01, 0.35)

     data = to_transformed(self.measurement)
     mean = to_transformed(producer.result)
     covariance = param_e[-1]*self.cov_mat_b_base + self.cov_mat_e_base*param_e[-2]

     residual = mean - data
     log_det = np.linalg.slogdet(covariance)[1]
     quad_form = np.dot(residual, np.linalg.solve(covariance, residual))
     return -0.5*log_det - 0.5*quad_form - 0.5*self.t*np.log(2*np.pi)
Example #4
0
def executeOneSetting(tensor, density, roundId, para):
    """Evaluate AMF prediction for one (density, round) setting, slice by slice.

    For every index along the third axis of ``tensor`` the slice is Box-Cox
    transformed and min-max scaled, split into train/test entries by
    ``evallib.removeEntries``, predicted with ``AMF.predict``, mapped back
    through the inverse scaling and ``evallib.argBoxcox``, scored with
    ``evallib.errMetric`` and written out with ``evallib.dumpresult``.

    Parameters
    ----------
    tensor : array whose shape unpacks as (numUser, numService, numTime).
    density : number passed to ``evallib.removeEntries``; also used in log
        messages and in the output file name.
    roundId : int, zero-based round index (reported as ``roundId + 1``).
    para : dict; the keys read here are 'dimension', 'metrics', 'outPath',
        'dataName' and 'dataType'.  The whole dict is also passed to
        ``AMF.predict``.

    Returns
    -------
    None.  Results are only written to files.

    NOTE(review): ``xrange`` and ``time.clock`` are Python 2 era names
    (``time.clock`` was removed in Python 3.8).
    """
    logger.info('density=%.2f, %2d-round starts.'%(density, roundId + 1))
    (numUser, numService, numTime) = tensor.shape
    dim = para['dimension']

    # Model state is created once, before the loop, and the same objects are
    # handed to AMF.predict for every slice.
    # NOTE(review): presumably AMF.predict updates them in place so that
    # later slices continue from earlier ones -- confirm against AMF.
    U = np.random.rand(numUser, dim)
    S = np.random.rand(numService, dim)
    p = np.zeros(numUser)
    q = np.zeros(numService)

    # run for each time slice
    for sliceId in xrange(numTime):
        # Box-Cox transformation: lambda (alpha) and the min/max used for
        # scaling are estimated from the strictly positive entries only.
        matrix = tensor[:, :, sliceId]
        dataVector = matrix[:]
        (transfVector, alpha) = stats.boxcox(dataVector[dataVector > 0])
        maxV = np.max(transfVector)
        minV = np.min(transfVector)
        transfMatrix = matrix.copy()
        # Every entry different from -1 is transformed with the fitted alpha,
        # then scaled with minV/maxV.
        # NOTE(review): -1 presumably marks a missing entry.  The mask on the
        # second line is recomputed on the already-transformed values, so a
        # value transformed to exactly -1 would be skipped by the scaling.
        transfMatrix[transfMatrix != -1] = stats.boxcox(transfMatrix[transfMatrix != -1], alpha)
        transfMatrix[transfMatrix != -1] = (transfMatrix[transfMatrix != -1] - minV) / (maxV - minV)

        # Remove entries to generate trainMatrix and testMatrix; the seed is
        # derived from the round and slice indices.
        seedID = roundId + sliceId * 100
        (trainMatrix, testMatrix) = evallib.removeEntries(matrix, density, seedID)
        # Training uses the transformed values at the kept positions, 0 elsewhere.
        trainMatrix = np.where(trainMatrix > 0, transfMatrix, 0)
        # Test targets are taken from the untransformed slice.
        (testVecX, testVecY) = np.where(testMatrix)
        testVec = matrix[testVecX, testVecY]

        # invocation to the prediction function (timed)
        startTime = time.clock()
        predictedMatrix = AMF.predict(trainMatrix, U, S, p, q, para)
        runningTime = float(time.clock() - startTime)

        # Evaluate the estimation error: undo the min-max scaling, then the
        # Box-Cox transform, before comparing with the raw test values.
        predVec = predictedMatrix[testVecX, testVecY]
        predVec = (maxV - minV) * predVec + minV
        predVec = evallib.argBoxcox(predVec, alpha)
        evalResult = evallib.errMetric(testVec, predVec, para['metrics'])
        result = (evalResult, runningTime)

        # dump the result for this slice, density and round
        outFile = '%s%s_%s_result_%02d_%.2f_round%02d.tmp'%(para['outPath'],
            para['dataName'], para['dataType'], sliceId + 1, density, roundId + 1)
        evallib.dumpresult(outFile, result)
        logger.info('sliceId=%02d done.'%(sliceId + 1))

    logger.info('density=%.2f, %2d-round done.'%(density, roundId + 1))
    logger.info('----------------------------------------------')
Example #5
0
    def test_fixed_lmbda(self):
        np.random.seed(12345)
        x = stats.loggamma.rvs(5, size=50) + 5
        xt = stats.boxcox(x, lmbda=1)
        assert_allclose(xt, x - 1)
        xt = stats.boxcox(x, lmbda=-1)
        assert_allclose(xt, 1 - 1/x)

        xt = stats.boxcox(x, lmbda=0)
        assert_allclose(xt, np.log(x))

        # Also test that array_like input works
        xt = stats.boxcox(list(x), lmbda=0)
        assert_allclose(xt, np.log(x))
Example #6
0
def auto_arima(endog, freq=None, d=None, D=None, max_p=5, max_q=5, max_P=2, max_Q=2, max_order=5, max_d=2, max_D=1, start_p=2, start_q=2, start_P=1, start_Q=1, stationary=False,
               ic="aic", stepwise=True, trace=False, approximation=None,
               test="adf", seasonal_test="ch", allowdrift=True, allowmean=True, lambda_parameter=None, *args, **kwargs):
    # NOTE(review): only the validation / setup prefix of this function is
    # visible here; the definition is cut off after ``if freq == 1:``, so
    # nothing is documented about the model search or the return value.
        # Parameter Validity Check
    # Reject series containing NaN (checked on the input as passed in).
    if np.any(np.isnan(endog)):
        raise ValueError("Missing Values in Series")
    # Keep a reference to the caller's object before any conversion.
    origin_endog = endog
    # NOTE(review): _is_using_pandas is defined elsewhere; presumably it
    # detects pandas input, which is then converted to an ndarray.
    if _is_using_pandas(endog, None):
        endog = np.asarray(endog)
    # At least 11 observations are required.
    if len(endog) <= 10:
        raise ValueError("There are too few observations.")
    # NOTE(review): second NaN check; the one above already raises for the
    # same condition, so this looks redundant -- confirm.
    if np.any(np.isnan(endog)):
        raise ValueError("NaN values in endogenous not allowed")
    # A series whose values all equal its first value is rejected.
    if np.all(endog == endog[0]):
        raise ValueError("The endogenous variable is a constant")
    # freq must be an int > 1; note that the default (None) fails this check.
    if (not isinstance(freq, int)) or freq <= 1:
        raise ValueError("The frequency parameter must be a integer greater than 1")
    # Optional Box-Cox transform with a caller-supplied lambda.
    if lambda_parameter is not None:
        # Only negative lambdas are rejected; 0 passes this check.
        if lambda_parameter < 0:
            raise ValueError("The Lambda parameter must be positive")
        if not np.all(endog > 0):
            raise ValueError("Box-Cox Transformation can be only used on positive series.")
        endog = boxcox(endog, lambda_parameter)

    # Cap the maximum orders: non-seasonal orders at floor(n / 3), seasonal
    # orders at floor(n / 3 / freq), where n is the series length.
    max_p = max_p if max_p <= floor(len(endog) / 3) else floor(len(endog) / 3)
    max_q = max_q if max_q <= floor(len(endog) / 3) else floor(len(endog) / 3)
    max_P = max_P if max_P <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq)
    max_Q = max_Q if max_Q <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq)
    # A series declared stationary gets both differencing orders set to 0.
    if stationary:
        D = 0
        d = 0
    # NOTE(review): freq <= 1 already raised above, so this condition appears
    # unreachable; its body is not visible in this excerpt.
    if freq == 1:
Example #7
0
    def test_alpha(self):
        np.random.seed(1234)
        x = stats.loggamma.rvs(5, size=50) + 5

        # Some regular values for alpha, on a small sample size
        _, _, interval = stats.boxcox(x, alpha=0.75)
        assert_allclose(interval, [4.004485780226041, 5.138756355035744])
        _, _, interval = stats.boxcox(x, alpha=0.05)
        assert_allclose(interval, [1.2138178554857557, 8.209033272375663])

        # Try some extreme values, see we don't hit the N=500 limit
        x = stats.loggamma.rvs(7, size=500) + 15
        _, _, interval = stats.boxcox(x, alpha=0.001)
        assert_allclose(interval, [0.3988867, 11.40553131])
        _, _, interval = stats.boxcox(x, alpha=0.999)
        assert_allclose(interval, [5.83316246, 5.83735292])
Example #8
0
    def test_mle(self):
        maxlog = stats.boxcox_normmax(self.x, method='mle')
        assert_allclose(maxlog, 1.758101, rtol=1e-6)

        # Check that boxcox() uses 'mle'
        _, maxlog_boxcox = stats.boxcox(self.x)
        assert_allclose(maxlog_boxcox, maxlog)
def readIn_PredictionData(fn,dfmax,dfmin,transformationFunction):
    """Load a comma-separated table and transform, then normalise, every column.

    Parameters
    ----------
    fn : path or buffer handed to ``pandas.read_csv``; the first row is
        used as the column names.
    dfmax, dfmin : indexable by column position; ``dfmax[i]`` and ``dfmin[i]``
        are passed to ``normalize_predictioninput`` together with column ``i``.
    transformationFunction : str
        "bin"    - values of 31 or more are replaced by 50.
        "binlog" - like "bin", with 0 replaced by 0.5, then natural log.
        "log"    - 0 replaced by 0.5, then natural log.
        "sqrt"   - square root.
        "boxcox" - Box-Cox transform with lambda fitted per column.
        Any other value skips the transformation and only normalises.

    Returns
    -------
    pandas.DataFrame with every column transformed and normalised.
    """
    # ``header=False`` is rejected by current pandas (TypeError); older
    # releases treated it like ``header=0``, so that meaning is spelled out.
    df = pd.read_csv(fn, sep=",", header=0)

    print(len(df.columns))
    for i, col in enumerate(df.columns):
        if transformationFunction == "bin":
            df[col] = [(x if x < 31 else 50) for x in df[col]]
        elif transformationFunction == "binlog":
            # 0 is mapped to 0.5 so that the logarithm stays finite.
            df[col] = [(0.5 if x == 0 else (x if x < 31 else 50)) for x in df[col]]
            df[col] = np.log(df[col])
        elif transformationFunction == "log":
            df[col] = [(0.5 if x == 0 else x) for x in df[col]]
            df[col] = np.log(df[col])
        elif transformationFunction == "sqrt":
            df[col] = np.sqrt(df[col])
        elif transformationFunction == "boxcox":
            # Element 0 of the result is the transformed data; the fitted
            # lambda is discarded.
            df[col] = stats.boxcox(np.array(df[col]))[0]
        df[col] = normalize_predictioninput(np.array(df[col]), dfmax[i], dfmin[i])
    return df
def sgs(data, xsteps=10, ysteps=10,
        nugget_dist=10, x_col='x_m', y_col='y_m', flux_col='flux',
        transform_data=True, invert_transform=True):
    """Fill a grid by kriging one node at a time, feeding each estimate back in.

    The nodes returned by ``makePathAndGrid`` are visited in path order.  For
    each node a kriging model is built from all points known so far, the node
    is estimated, and the estimate is appended to the data used for the next
    node.

    Parameters
    ----------
    data : pandas.DataFrame holding the coordinate and value columns.
    xsteps, ysteps : passed to ``makePathAndGrid``.
    nugget_dist : passed to ``kriging.krig_model``.
    x_col, y_col, flux_col : names of the x, y and value columns.  They are
        used both to read ``data`` and to label the working frame.
    transform_data : Box-Cox transform the values before kriging.
    invert_transform : when the data were transformed, map the grid and the
        new estimates back with ``invboxcox``.

    Returns
    -------
    (grid_x, grid_y, M, new_x, new_y, new_flux): the two grid coordinate
    arrays reshaped like ``M``, the estimate grid, and the coordinates and
    values of the points added during the run.
    """
    # Read the inputs through the column-name parameters; attribute access
    # (data.x_m, ...) only worked when the default names were used.
    x = data[x_col].values
    y = data[y_col].values
    flux = data[flux_col].values
    if transform_data:
        flux, L = scpstats.boxcox(flux)
    data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])
    new_x = []
    new_y = []
    new_flux = []
    # create array for the output
    idx, grid, indexGrid, M = makePathAndGrid(data, xsteps, ysteps)
    for step in idx:
        point = [grid[0][step], grid[1][step]]
        # The model is rebuilt every step because ``data`` grows by one point.
        model = kriging.krig_model(data, nugget_dist, x_col, y_col, flux_col)
        est = kriging.krig_sample(model, point)
        indexPoint = [indexGrid[0][step], indexGrid[1][step]]
        M[indexPoint[0], indexPoint[1]] = est
        x = np.r_[x, point[0]]
        new_x.append(x[-1])
        y = np.r_[y, point[1]]
        new_y.append(y[-1])
        flux = np.r_[flux, est]
        new_flux.append(flux[-1])
        data = pd.DataFrame(np.c_[x, y, flux], columns=[x_col, y_col, flux_col])

    # ``L`` only exists when the forward transform was applied.
    if invert_transform and transform_data:
        M = invboxcox(M, L)
        new_flux = invboxcox(np.array(new_flux), L)

    return grid[0,:].reshape(M.shape), grid[1,:].reshape(M.shape), M, new_x, new_y, new_flux
Example #11
0
def processing(data):
    """Clean, impute and de-skew the feature table in place.

    Steps: build derived features (``create_feature``), drop the columns in
    ``to_drop``, fill missing values, Box-Cox transform numeric columns when
    that lowers their absolute skewness, and finally apply the value
    mappings in ``mapper``.  ``data`` is modified in place; nothing is
    returned.
    """
    # Build the derived features.
    create_feature(data)
    # Drop the unwanted features.
    data.drop(to_drop, axis=1, inplace=True)

    # For these features 'None' is itself a valid category according to the
    # feature description, so missing values are filled with that literal.
    fill_none = ['MasVnrType','BsmtExposure','GarageType','MiscFeature']
    for col in fill_none:
        # Assign back instead of a chained ``fillna(inplace=True)``, which
        # may act on a temporary copy and leave ``data`` unchanged.
        data[col] = data[col].fillna('None')

    # Remaining missing values: mode for categorical (object) columns,
    # median for numeric ones.
    na_col = data.dtypes[data.isnull().any()]
    for col in na_col.index:
        if na_col[col] != 'object':
            data[col] = data[col].fillna(data[col].median())
        else:
            data[col] = data[col].fillna(data[col].mode()[0])

    # De-skew numeric features.  ``numeric_col`` are the numeric columns and
    # ``zero_col`` the columns that contain zeros; for the latter only the
    # positive values are transformed, since Box-Cox needs positive input.
    numeric_col = data.skew(numeric_only=True).index
    zero_col = data.columns[data.isin([0]).any()]
    for col in numeric_col:
        # Skip code-like features with few distinct values (0, 1, 2, 3, ...);
        # transforming them is not meaningful.
        if data[col].nunique() <= 10:
            continue
        if col in zero_col:
            # Select only the positive rows.  Masking the whole frame with
            # ``data[data > 0][col]`` keeps every row and turns the others
            # into NaN, which then poisons the Box-Cox fit.
            trans_data = data.loc[data[col] > 0, col]
            before = abs(trans_data.skew())
            cox, _ = boxcox(trans_data)
            log_after = abs(Series(cox).skew())
            if log_after < before:
                # Make room for float results in an integer column.
                data[col] = data[col].astype(float)
                data.loc[trans_data.index, col] = cox
        else:
            # No zeros: transform the whole column.
            before = abs(data[col].skew())
            cox, _ = boxcox(data[col])
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data[col] = cox
    # Apply the value mappings.
    for col, mapp in mapper.items():
        data[col] = data[col].map(mapp)
 def transform(self, x):
     """Transform each column of x with its fitted parameter.

     Column i is paired with ``self.trans_params[i]`` and passed to
     ``self.w_t`` (method 'lambert') or ``boxcox`` (method 'boxcox').  Any
     other method raises NotImplementedError.  The result has the
     transformed columns in the same order as the input.
     """
     x = np.asarray(x)
     if self.method == 'lambert':
         apply_one = self.w_t
     elif self.method == 'boxcox':
         apply_one = boxcox
     else:
         raise NotImplementedError

     transformed_columns = []
     for column, param in zip(x.T, self.trans_params):
         transformed_columns.append(apply_one(column, param))
     return np.array(transformed_columns).T
def boxcoxtransform(dataframe, numeric_feats):
    """Box-Cox transform the right-skewed numeric columns of ``dataframe``.

    Skewness is computed per column of ``numeric_feats`` with missing values
    dropped.  Columns whose skewness exceeds 0.25 are shifted by +1 and
    replaced, in place, by their Box-Cox transform.

    Returns the same dataframe object and a ``defaultdict(float)`` mapping
    each transformed column name to its fitted lambda.
    """
    lam = defaultdict(float)
    skewness = dataframe[numeric_feats].apply(lambda column: skew(column.dropna()))
    to_transform = skewness[skewness > 0.25].index

    for name in to_transform:
        # Shift by one first, then fit on the shifted column.
        dataframe[name] = dataframe[name] + 1
        transformed, fitted_lambda = boxcox(dataframe[name])
        dataframe[name] = transformed
        lam[name] = fitted_lambda
    return dataframe, lam
Example #14
0
    def test_lmbda_None(self):
        np.random.seed(1234567)
        # Start from normal rv's, do inverse transform to check that
        # optimization function gets close to the right answer.
        np.random.seed(1245)
        lmbda = 2.5
        x = stats.norm.rvs(loc=10, size=50000)
        x_inv = (x * lmbda + 1)**(-lmbda)
        xt, maxlog = stats.boxcox(x_inv)

        assert_almost_equal(maxlog, -1 / lmbda, decimal=2)
 def fit(self, x):
     """Estimate one transformation parameter per column of x.

     Each estimate is appended to ``self.trans_params``: the result of
     ``self.iterate_moments`` for method 'lambert', or the fitted Box-Cox
     lambda for method 'boxcox'.  Any other method raises
     NotImplementedError.  Returns None.
     """
     x = np.asarray(x)
     if self.method == 'lambert':
         def estimate(column):
             return self.iterate_moments(column, tol=self.tol,
                                         max_iter=self.max_iter)
     elif self.method == 'boxcox':
         def estimate(column):
             # Element 1 of boxcox's result is the fitted lambda.
             return boxcox(column)[1]
     else:
         raise NotImplementedError

     for column in x.T:
         self.trans_params.append(estimate(column))
def boxcox(x,y,y_label):
    """Fit the module-level regressor on Box-Cox transformed targets and plot.

    ``y`` is shifted by ``abs(min(y)) + 1`` so that every value is strictly
    positive, Box-Cox transformed with a fitted lambda, and regressed on
    ``x`` with ``regr``.  Predictions are mapped back to the original scale,
    a variance ratio is printed and a scatter plot of predicted against
    actual values is shown.

    Parameters
    ----------
    x : features passed to ``regr.fit`` / ``regr.predict``.
    y : target values.
    y_label : label for the x-axis of the plot (the actual values).

    Returns None.
    """
    # Computed once: the same offset is added before and removed after.
    abs_min = abs(min(y))
    box_cox, maxlog = stats.boxcox(y + abs_min + 1)
    regr.fit(x,box_cox)
    box_cox_predict = regr.predict(x)
    y_predict = inv_boxcox(box_cox_predict,maxlog) - abs_min - 1
    # Reported figure: variance of the predictions over variance of the targets.
    print("R squared: " + str(np.var(y_predict)/np.var(y)))
    # Plot outputs
    fig = plt.figure()
    plt.scatter(y, y_predict, color='blue')
    plt.xlabel(y_label)
    plt.ylabel('predicted')
    plt.show()
def box_cox(df, lmbda=None, alpha=None):
    """Box-Cox transform every column of a pandas dataframe.

    Box-Cox needs strictly positive input, so each column is first shifted
    to have a minimum of 0.1 (``column - min(column) + 0.1``).

    Parameters
    ----------
    df : pandas.DataFrame with numeric columns.
    lmbda : optional fixed lambda applied to every column.  When None, a
        lambda is fitted separately for each column.
    alpha : passed through to ``scipy.stats.boxcox``; any confidence
        interval it produces is discarded.

    Returns
    -------
    pandas.DataFrame with the same columns, holding the transformed values
    on a fresh 0..n-1 index.
    """
    transformed = {}
    for val in list(df.columns):
        shifted = df[val] - min(df[val]) + 0.1
        result = stats.boxcox(shifted, lmbda, alpha)
        # With a fixed lambda scipy returns the transformed array itself;
        # only when lambda is fitted is the result a tuple whose first
        # element is the data.  Indexing [0] unconditionally would pick a
        # single value in the fixed-lambda case.
        transformed[val] = np.asarray(result if lmbda is not None else result[0])
    return pd.DataFrame(transformed, columns=df.columns)
Example #18
0
def transform_data_to_gaussian_1D(feature_vector):
    """Return the Box-Cox transform of a one-dimensional sample.

    The lambda is fitted by ``scipy.stats.boxcox`` and discarded; only the
    transformed values are returned.  No plotting library is needed.
    """
    transformed, _ = stats.boxcox(feature_vector)
    return transformed
Example #19
0
 def fit(self, X):
     """Fit shift, Box-Cox lambda(s) and standardisation statistics on X.

     For 2-D input each column gets its own shift and lambda.  The shift is
     ``max(-min, 0) + 3 * std``; the lambda is clamped to
     ``[self.minlmbda, self.maxlmbda]`` and snapped to 0 when its magnitude
     is below 1e-4.  For 1-D input the shift is
     ``max(1e-10, -min) + 3 * std`` and the lambda is used as fitted.

     Sets ``self.shift``, ``self.lmbda``, ``self.xmean`` and ``self.xstd``
     (mean and standard deviation of the transformed data).  The input
     array is not modified.  Returns None.
     """
     xtrans = numpy.zeros(shape=X.shape)
     if len(X.shape) == 2:
         shift = -X.min(axis=0)
         shift[shift < 0] = 0
         # Out-of-place so an integer-valued minimum can take a float offset.
         shift = shift + 3 * X.std(axis=0)
         self.shift = shift
         # Work on a shifted copy; ``X += shift`` would alter the caller's data.
         shifted = X + shift
         self.lmbda = numpy.zeros(X.shape[1])
         for j in range(X.shape[1]):
             _, fitted = boxcox(shifted[:, j])
             fitted = min(max(fitted, self.minlmbda), self.maxlmbda)
             if numpy.abs(fitted) < 1e-4:
                 fitted = 0
                 print("changing lambda")
             self.lmbda[j] = fitted
             xtrans[:, j] = boxcox(shifted[:, j], self.lmbda[j])
     elif len(X.shape) == 1:
         self.shift = max([1e-10, -X.min()])
         self.shift += 3 * X.std()
         shifted = X + self.shift
         xtrans, self.lmbda = boxcox(shifted)
     self.xmean = xtrans.mean(axis=0)
     self.xstd = xtrans.std(axis=0)
Example #20
0
def append_boxcox(data, cols, drop_old=False):
    """Add Box-Cox transformed copies of the given columns to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame, modified in place.
    cols : a column name or a list of column names to transform.
    drop_old : when True, each source column is dropped after its
        ``<name>_boxcox`` column has been added.

    Returns None.
    """
    try:
        string_types = basestring  # noqa: F821 -- exists on Python 2 only
    except NameError:
        string_types = str
    if isinstance(cols, string_types):
        cols = [cols]

    for col in cols:
        # boxcox also returns maxlog, the lambda that was chosen; it is
        # discarded here but could be kept for pipelining objects.
        data[col + '_boxcox'] = stats.boxcox(data[col])[0]
        if drop_old:
            data.drop(col, axis=1, inplace=True)
Example #21
0
def boxcox(X):
    """
    Gaussianize X using the Box-Cox transformation: [samples x phenotypes]

    - each phenotype is brought to a positive scale by first subtracting its
      minimum value and adding 1.
    - then each phenotype is transformed by the Box-Cox transformation with
      a lambda fitted on its non-missing values.

    Missing values (NaN) stay NaN in the output.  A column with no observed
    values is left as NaN and gets a NaN lambda.

    Returns
    -------
    X_transformed : float array with the shape of X.
    maxlog : array with one fitted lambda per column.
    """
    # Local import: numpy is used directly rather than through the numpy
    # aliases in the scipy top-level namespace, which are deprecated.
    import numpy as np

    X = np.asarray(X)
    # Always allocate a float result: with ``zeros_like`` an integer input
    # would silently truncate the transformed values.
    out_dtype = X.dtype if X.dtype.kind == 'f' else float
    X_transformed = np.zeros(X.shape, dtype=out_dtype)
    maxlog = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        i_nan = np.isnan(X[:, i])
        values = X[~i_nan, i]
        X_transformed[i_nan, i] = X[i_nan, i]
        if values.size == 0:
            # Nothing to fit on; ``values.min()`` would raise.
            maxlog[i] = np.nan
            continue
        X_transformed[~i_nan, i], maxlog[i] = st.boxcox(values - values.min() + 1.0)
    return X_transformed, maxlog
Example #22
0
def transform_features(x_train, x_test):
    """Box-Cox transform the first six feature columns of both sets.

    Only columns 0-5 are kept.  For each kept column the optimal lambda is
    estimated on the training data (after adding 1.0 to move values away
    from zero) and that same lambda is then applied to the test data.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Untransformed training features.
    x_test : np.array [n_samples, n_features]
        Untransformed testing features.

    Returns
    -------
    x_train_boxcox : np.array [n_samples, n_features_trans]
        Transformed training features.
    x_test_boxcox : np.array [n_samples, n_features_trans]
        Transformed testing features.
    """
    offset = 1.0  # shift features away from zero

    kept_train = x_train[:, 0:6]
    kept_test = x_test[:, 0:6]
    _, n_columns = kept_train.shape

    # Pass 1: fit a lambda per column on the training data.
    train_out = np.zeros(kept_train.shape)
    lambdas = np.zeros((n_columns,))
    for col in range(n_columns):
        train_out[:, col], lambdas[col] = boxcox(kept_train[:, col] + offset)

    # Pass 2: reuse the training lambdas on the test data.
    test_out = np.zeros(kept_test.shape)
    for col in range(n_columns):
        test_out[:, col] = boxcox(kept_test[:, col] + offset, lmbda=lambdas[col])

    return train_out, test_out
Example #23
0
def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    A 'loss' column of zeros is added to ``test`` (in place) before the two
    frames are concatenated with a fresh index.  Skewness is measured on the
    training rows only, ignoring missing values; every feature with skewness
    above 0.25 is shifted by +1 and Box-Cox transformed over the combined
    frame.

    Returns the combined frame and the number of training rows.
    """
    ntrain = train.shape[0]
    test['loss'] = 0
    combined = pd.concat((train, test)).reset_index(drop=True)

    # compute skew and do Box-Cox transformation (Tilli)
    skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))
    print("\nSkew in numeric features:")
    print(skewness)

    for name in skewness[skewness > 0.25].index:
        combined[name] = combined[name] + 1
        # The fitted lambda is not needed afterwards.
        combined[name], _ = boxcox(combined[name])
    return combined, ntrain
Example #24
0
 def preprocess_feature(cls, feature, parameters):
     """Normalise one feature array according to ``parameters.feature_type``.

     Entries close to ``MISSING_VALUE`` are treated as missing and come out
     as 0 (for ENUM: an all-zero row).  BINARY features map to 0/1 float32.
     Otherwise an optional Box-Cox step (when ``parameters.boxcox_lambda`` is
     set) is followed by the type-specific transform: logit of the clipped
     value for PROBABILITY, ``cls.value_to_quantile`` per element for
     QUANTILE, a one-hot matrix for ENUM, a rescale based on
     ``min_value``/``max_value`` for CONTINUOUS_ACTION, and clipped
     standardisation with ``mean``/``stddev`` for everything else.
     """
     kind = parameters.feature_type
     # 1 where a value is present, 0 where the entry is the missing marker.
     present = 1 - np.isclose(feature, MISSING_VALUE)

     if kind == identify_types.BINARY:
         # Binary features are always 1 unless they are 0
         nonzero = feature != 0
         return (nonzero * present).astype(np.float32)

     if parameters.boxcox_lambda is not None:
         # Shift, then floor at the margin so Box-Cox gets valid input.
         shifted = np.maximum(feature + parameters.boxcox_shift, BOX_COX_MARGIN)
         feature = stats.boxcox(shifted, parameters.boxcox_lambda)

     # Every branch below rebinds ``feature`` to a new array, so the final
     # in-place multiply never touches the caller's input.
     if kind == identify_types.PROBABILITY:
         clipped = np.clip(feature, 0.01, 0.99)
         feature = special.logit(clipped)
     elif kind == identify_types.QUANTILE:
         quantized = np.zeros_like(feature)
         for i in six.moves.range(feature.shape[0]):
             quantized[i] = cls.value_to_quantile(feature[i], parameters.quantiles)
         feature = quantized
     elif kind == identify_types.ENUM:
         possible_values = parameters.possible_values
         index_of = {value: position for position, value in enumerate(possible_values)}
         one_hot = np.zeros((len(feature), len(possible_values)))
         for row, val in enumerate(feature):
             if abs(val - MISSING_VALUE) < 1e-2:
                 # This check is required by the PT preprocessing but not C2
                 continue
             one_hot[row][index_of[val]] = 1.0
         return one_hot
     elif kind == identify_types.CONTINUOUS_ACTION:
         low = parameters.min_value
         high = parameters.max_value
         scale = (1 - 1e-6) * 2 / (high - low)
         feature = (feature - low) * scale - 1 + 1e-6
     else:
         centered = feature - parameters.mean
         centered /= parameters.stddev
         feature = np.clip(centered, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)

     feature *= present
     return feature
Example #25
0
    def transform(self, x):
        """Transform each column of x with the parameters stored in ``self.taus``.

        A 1-d input is treated as a single column.  Shape problems are only
        reported with a printed message; processing continues regardless.

        Strategies: 'lambert' applies ``w_t`` with each column's tau,
        'brute' maps the ranks of each column through the normal quantile
        function, 'boxcox' applies Box-Cox with each column's lambda.  Any
        other strategy raises NotImplementedError.
        """
        x = np.asarray(x)
        if len(x.shape) == 1:
            x = x[:, np.newaxis]
        elif len(x.shape) != 2:
            print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")
        if x.shape[1] != len(self.taus):
            print("%d variables in test data, but %d variables were in training data." % (x.shape[1], len(self.taus)))

        if self.strategy == 'lambert':
            return np.array([w_t(x_i, tau_i) for x_i, tau_i in zip(x.T, self.taus)]).T
        elif self.strategy == 'brute':
            # Rank-based mapping; the stored parameters are not used here.
            return np.array([norm.ppf((rankdata(x_i) - 0.5) / len(x_i)) for x_i in x.T]).T
        elif self.strategy == 'boxcox':
            return np.array([boxcox(x_i, lmbda=lmbda_i) for x_i, lmbda_i in zip(x.T, self.taus)]).T
        else:
            raise NotImplementedError
Example #26
0
    def fit(self, x):
        """Estimate one transformation parameter per column and store it.

        A 1-d input is treated as a single column; other non-2-d input only
        triggers a printed message.  One entry per column is appended to
        ``self.taus``: the ``igmm`` result for 'lambert', ``None`` for
        'brute', and the fitted Box-Cox lambda for 'boxcox'.  Any other
        strategy raises NotImplementedError.  Returns None.
        """
        x = np.asarray(x)
        if len(x.shape) == 1:
            x = x[:, np.newaxis]
        elif len(x.shape) != 2:
            print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

        if self.strategy == 'lambert':
            for x_i in x.T:
                self.taus.append(igmm(x_i, tol=self.tol, max_iter=self.max_iter))
        elif self.strategy == 'brute':
            for x_i in x.T:
                self.taus.append(None)  # TODO: In principle, we could store parameters to do a quasi-invert
        elif self.strategy == 'boxcox':
            for x_i in x.T:
                # Element 1 of boxcox's result is the fitted lambda.
                self.taus.append(boxcox(x_i)[1])
        else:
            raise NotImplementedError
Example #27
0
    def Preprocess_TransformNumericFeatures(self, dfall, trans_type ='boxcox', correction=0.00001):
        """Box-Cox transform every numeric feature column of ``dfall`` in place.

        Parameters
        ----------
        dfall : pandas.DataFrame containing the columns in ``self.num_features``.
        trans_type : only 'boxcox' is supported.
        correction : small constant added to each column before transforming.

        Returns
        -------
        The same ``dfall`` object.  The fitted lambda of each column is kept
        in ``self.lmbdaDict``.

        Raises
        ------
        TypeError
            If ``self.num_features`` is None or ``trans_type`` is unsupported.
        """
        if self.num_features is None:
            raise TypeError("Execute the SetUpTrainTest method to use this feature")

        if trans_type not in ['boxcox']:
            raise TypeError("Transformation type not supported")

        self.lmbdaDict = {}
        for c in self.num_features:
            print('Applying %s transformation on: %s' % (trans_type, c))
            if trans_type == 'boxcox':
                transformed, fitted_lambda = stats.boxcox(dfall[c] + correction)
                dfall[c] = transformed
                self.lmbdaDict[c] = fitted_lambda

        return dfall
Example #28
0
def _estimate_lambda_single_y(y):
    """Return the Box-Cox lambda fitted to a single vector.

    The lambda is chosen by ``boxcox`` itself (called with ``lmbda=None``).
    No validation is performed on the input.

    Parameters
    ----------

    y : ndarray, shape (n_samples,)
       The vector being estimated against
    """
    # Copy into an array, let boxcox fit lambda, and keep only the second
    # element of its result (the fitted lambda).
    return boxcox(np.array(y), lmbda=None)[1]
    def fit(self, x, y=None):
        """Fit a Gaussianizing transformation to each variable/column in x.

        A 1-d input is treated as one column; other non-2-d input only
        triggers a printed message.  One coefficient per column is appended
        to ``self.coefs_`` and ``self`` is returned.  ``y`` is ignored.
        """
        x = np.asarray(x)
        n_dims = len(x.shape)
        if n_dims == 1:
            x = x[:, np.newaxis]
        elif n_dims != 2:
            print("Data should be a 1-d list of samples to transform or a 2d array with samples as rows.")

        strategy = self.strategy
        if strategy not in ('lambert', 'brute', 'boxcox'):
            raise NotImplementedError
        if strategy == 'lambert' and self.verbose:
            print("Gaussianizing with Lambert method")

        for column in x.T:
            if strategy == 'lambert':
                coef = igmm(column, tol=self.tol, max_iter=self.max_iter)
            elif strategy == 'boxcox':
                coef = boxcox(column)[1]
            else:
                # TODO: In principle, we could store parameters to do a quasi-invert
                coef = None
            self.coefs_.append(coef)
        return self
Example #30
0
def boxcox_xform(X, scaling=True):
    """
    Box-Cox transform of strictly positive data, optionally rescaled first.

    :param X: data (numeric list, Pandas series or 1d np array)
    :param scaling: when True, divide the data by its largest absolute value
                    before transforming, so that values lie in (0, 1]; this
                    guards against overflow/underflow for very large inputs
    :return: ``(y, x_max, lbda)`` - the transformed array, the scale factor
             that was applied (1.0 without scaling) and the fitted lambda.
             ``(None, None, None)`` when X is empty.
             ``None`` when X contains zero or negative values; Box-Cox is
             not defined there and a message pointing to ``yj_xform`` is
             printed instead.
    """
    x_arr = np.array(list(X))
    # Check for emptiness first: taking max/min of an empty array raises.
    if x_arr.size == 0:
        print('boxcox_xform: no data')
        return None, None, None
    if np.min(x_arr) <= 0.0:
        print('use YJ transform: yj_xform(X)')
        return None
    x_max = np.max(np.abs(x_arr)) if scaling is True else 1.0
    z = x_arr / x_max         # scale to deal with overflow/underflow: values in (0, 1]
    y, lbda = sps.boxcox(z, lmbda=None, alpha=None)
    return y, x_max, lbda