Code example #1
def get_scaler(scale_method='StandardScaler'):
    """
  Get different kinds of scalers from scikit-learn

  :param scale_method: scale method
  :returns: scaler instance
  :raises: none
  """
    scaler = None

    if scale_method == 'StandardScaler':
        scaler = preprocessing.StandardScaler()

    elif scale_method == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()

    elif scale_method == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()

    elif scale_method == 'RobustScaler':
        scaler = preprocessing.RobustScaler()

    elif scale_method == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()

    elif scale_method == 'Normalizer':
        scaler = preprocessing.Normalizer()

    elif scale_method == 'PowerTransformer':
        scaler = preprocessing.PowerTransformer()

    else:
        print(scale_method, 'not found')

    return scaler
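A minimal usage sketch for the helper above, with made-up data; it assumes the snippet's own `from sklearn import preprocessing` import plus NumPy:

import numpy as np
from sklearn import preprocessing

X = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])  # toy feature matrix
scaler = get_scaler('RobustScaler')
if scaler is not None:
    X_scaled = scaler.fit_transform(X)  # in practice, fit on training data only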
Code example #2
def norm_and_zscore_ML_array(ML_array,
                             robust=False,
                             decomp=False,
                             gauss=False):
    """
    default preprocessing is simple MinMax L1 norm
    input
    ML_array : array shape : (Cut Trials, Features, Frames)   where Cut Trials refers to either the number of Trials
    inside the testing data or training data (Don't call this function for just the total ML data, split beforehand..)
    robust: boolean flag, use sci-kit learn robust scaling to normalize our data
    decomp : boolean flag, post-processing step used to return first whitened 20 PCA components to remove linear dependence
    gauss : boolean flag, use sci-kit learn gaussian distribution scaling to normalize our data
    """
    # ML_array
    if robust:
        # robust_scale() is a plain function; use the RobustScaler estimator
        # so the fit_transform call below works
        pt = preprocessing.RobustScaler()
    elif gauss:
        pt = preprocessing.PowerTransformer(method='box-cox',
                                            standardize=False)
    else:
        pt = preprocessing.MinMaxScaler()

    r_ML_array = pt.fit_transform(
        ML_array.reshape(ML_array.shape[0],
                         ML_array.shape[1] * ML_array.shape[2]))
    # apply normalization to feature axis
    if decomp:  # used to remove linear correlations, if they exist
        pca = decomposition.PCA(n_components=20, whiten=True)
        r_ML_array = pca.fit_transform(r_ML_array)

    return r_ML_array
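A hedged usage sketch with random data; it assumes `from sklearn import preprocessing, decomposition` as the snippet's context, and keeps values strictly positive since gauss=True selects box-cox:

import numpy as np

train = np.random.rand(10, 4, 5) + 0.1  # (trials, features, frames), all positive
flat = norm_and_zscore_ML_array(train, gauss=True)
print(flat.shape)  # (10, 20): trials x (features * frames)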
Code example #3
def NormaliseColumnValues(df, normFactor):
    ''' If there is no normalisation needed, just return the DataFrame unmodified '''
    if normFactor == 'none':
        return df
    ''' Only consider numeric data '''
    colsToTransform = list(df.columns)
    colsToTransform.remove('Contig')

    contigNames = df.Contig

    if normFactor == 'unit':
        ''' Straight out of the preprocessing.scale documentation. '''
        df[colsToTransform] = preprocessing.scale(df[colsToTransform], axis=0)
        return df
    elif normFactor == 'yeojohnson':
        '''
            Taken from https://scikit-learn.org/stable/modules/preprocessing.html
            Since df has already been sorted, can just do an iloc slice of the values.
        '''
        pt = preprocessing.PowerTransformer(method='yeo-johnson',
                                            standardize=True)
        normArray = pt.fit_transform(df.iloc[:, 1:])
        ''' Recast the data as a DataFrame and return'''
        normDf = pd.DataFrame(normArray, columns=colsToTransform)
        normDf.insert(loc=0, column='Contig', value=contigNames)
        return normDf
    else:
        ''' Unrecognised normFactor value: return the frame unchanged. '''
        return df
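An illustrative call with a made-up coverage table; assumes pandas and `from sklearn import preprocessing` are available:

import pandas as pd

df = pd.DataFrame({'Contig': ['c1', 'c2', 'c3'],
                   'cov1': [1.0, 5.0, 9.0],
                   'cov2': [2.0, 4.0, 8.0]})
normed = NormaliseColumnValues(df, 'yeojohnson')  # 'Contig' is reattached as column 0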
Code example #4
def scale_features(X, Y, name='Standard'):
    """Scale features for training and testing.

    Parameters
    ----------
    X : list
        Feature values
    Y : list
        Target values
    name : str
        Name of the scaler to use

    Returns
    -------
    list
        Scaled feature values
    """
    # Select feature scaler
    if name == 'MinMax':
        scaler = scalers.MinMaxScaler()
    elif name == 'Robust':
        scaler = scalers.RobustScaler()
    elif name == 'Quantile':
        scaler = scalers.QuantileTransformer()
    elif name == 'Power':
        scaler = scalers.PowerTransformer()
    elif name == 'Standard':
        scaler = scalers.StandardScaler()
    else:
        print('Invalid scale name, defaulting to Standard')
        scaler = scalers.StandardScaler()

    # Scale features
    return scaler.fit_transform(X, Y)
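A quick sketch with toy lists; it assumes the snippet relies on an alias along the lines of `from sklearn import preprocessing as scalers`:

X = [[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]
Y = [0, 1, 0]
X_scaled = scale_features(X, Y, name='Power')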
Code example #5
def statistic_transform(df, **params):
    """
    特征变换
    :param df:
    :param params:
    :return:
    """
    import scipy.stats as spstats

    # for column in list(df.columns):
    #     # compute the optimal lambda value
    #     opt_lambda = spstats.boxcox(df[column])
    #     df[column] = spstats.boxcox(df[column], lmbda=opt_lambda)

    from sklearn import preprocessing
    import numpy as np
    import pandas as pd

    outlier_columns = DefaultConfig.outlier_columns
    for column in outlier_columns:
        # `x == np.nan` is always False, so the original line never replaced
        # anything; fill NaNs with the column minimum instead
        df[column] = df[column].fillna(df[column].min())
        # box-cox requires strictly positive values, so nudge zeros
        df[column] = df[column].apply(lambda x: 1e-5 if x == 0 else x)

    pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)

    df[DefaultConfig.outlier_columns] = pd.DataFrame(columns=outlier_columns,
                                                     data=pt.fit_transform(df[outlier_columns]))

    return df
Code example #6
def normal_distribution_transformer(df):
    '''
    Non-linear Transformation
    '''
    transformer = preprocessing.PowerTransformer(method='yeo-johnson')
    df_transformed = pd.DataFrame(transformer.fit_transform(df), columns=df.columns)
    return df_transformed
Code example #7
    def test_inv_transform_ct_22(self):
        """
        test inv_transform_ct with PowerTransformer Encoder Sklearn and passthrough option
        """
        y = pd.DataFrame(data=[0, 1], columns=['y'])

        train = pd.DataFrame({'num1': [0, 1],
                              'num2': [0, 2],
                              'other': ['A', 'B']})

        enc = ColumnTransformer(
            transformers=[
                ('power', skp.PowerTransformer(), ['num1', 'num2'])
            ],
            remainder='passthrough')
        enc.fit(train, y)
        test = pd.DataFrame({'num1': [0, 1, 1],
                             'num2': [0, 2, 3],
                             'other': ['A', 'B', 'C']})

        expected = pd.DataFrame({'power_num1': [0.0, 1.0, 1.0],
                                 'power_num2': [0.0, 1.9999999997665876, 3.000000000169985],
                                 'other': ['A', 'B', 'C']})

        result = pd.DataFrame(enc.transform(test))
        result.columns = ['col1', 'col2', 'other']
        original = inverse_transform(result, enc)
        pd.testing.assert_frame_equal(original, expected)
Code example #8
def PowerTransformer(train_df, test_df, HP):
    method, standardize, copy = HP['PowerTransformer']['method'], HP['PowerTransformer']['standardize'], \
                                HP['PowerTransformer']['copy']

    train_x = train_df.iloc[:, :-1]
    train_y = train_df.iloc[:, -1:]
    test_x = test_df.iloc[:, :-1]
    test_y = test_df.iloc[:, -1:]

    transformer = preprocessing.PowerTransformer(method=method,
                                                 standardize=standardize,
                                                 copy=copy)
    train_x_copy = train_x.copy()
    train_x_transformed = transformer.fit_transform(train_x_copy)
    test_x_copy = test_x.copy()
    test_x_transformed = transformer.transform(test_x_copy)  # TODO check here

    train_column_name = list(train_x_copy.columns)
    test_column_name = list(test_x_copy.columns)

    train_x_transformed_df = pd.DataFrame(train_x_transformed)
    train_x_transformed_df.columns = train_column_name
    train_df_transformed = train_x_transformed_df.assign(label=train_y.values)

    test_x_transformed_df = pd.DataFrame(test_x_transformed)
    test_x_transformed_df.columns = test_column_name
    test_df_transformed = test_x_transformed_df.assign(label=test_y.values)

    return train_df_transformed, test_df_transformed
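A hypothetical HP dict and toy frames matching the keys the function reads; assumes pandas and `from sklearn import preprocessing`:

import pandas as pd

HP = {'PowerTransformer': {'method': 'yeo-johnson', 'standardize': True, 'copy': True}}
train_df = pd.DataFrame({'f1': [1.0, 2.0, 3.0], 'label': [0, 1, 0]})
test_df = pd.DataFrame({'f1': [1.5, 2.5], 'label': [1, 0]})
train_t, test_t = PowerTransformer(train_df, test_df, HP)  # test reuses the fitted transformer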
Code example #9
def standardizing(df, methods):
    '''
    This function takes in a dataframe and a method for standardizing; it
    returns the standardized dataframe.

    The methods are:
         - z: for z-scores
         - mm: for min-max
         - robust: for robust
         - gauss (or any other value): for gaussian (yeo-johnson)
    '''

    if methods == 'z':
        scaler = preprocessing.StandardScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    elif methods == 'mm':
        scaler = preprocessing.MinMaxScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    elif methods == 'robust':
        scaler = preprocessing.RobustScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    else:
        scaler = preprocessing.PowerTransformer(method='yeo-johnson',
                                                standardize=True)
        scaled_df = pd.DataFrame(scaler.fit_transform(df))

    return scaled_df
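A short sketch, assuming pandas and `from sklearn import preprocessing`:

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
z_df = standardizing(df, 'z')
g_df = standardizing(df, 'gauss')  # any unlisted method also lands on yeo-johnson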
Code example #10
def calculateCrimeRates(crime_weather_agg, cr_type=False):
    '''
    Calculate, standardize, and classify crime rates per 100,000 people.

    This function only standardizes and classifies crime rates for all crimes,
    not by crime type.
    '''
    #Remove blank Census data.
    crime_weather_agg_na = crime_weather_agg.dropna(
        axis='index', how='any', subset=['TotalPop']).reset_index(drop=True)

    #Calculate crime rates per 100,000 people.
    crime_weather_agg_na['crime_rate'] = (
        crime_weather_agg_na['crime_counts'] /
        crime_weather_agg_na['TotalPop']) * 100000

    if not cr_type:
        #Standardize crime rates.
        standardize = crime_weather_agg_na[['crime_rate']]
        power = preprocessing.PowerTransformer(method='box-cox',
                                               standardize=False)
        crime_weather_agg_na['crs'] = power.fit_transform(standardize)

        #Calculate statistics for classification.
        stats = crime_weather_agg_na[['crs']].describe().transpose()

        #Classify standardized crime rates.
        crime_weather_agg_na['crime_rate_cat'] = crime_weather_agg_na[[
            'crs'
        ]].apply(classifyCrimeRates, args=(stats[['mean', 'std']], ), axis=1)

    return crime_weather_agg_na
Code example #11
    def _box_cox_transform(self, columns):
        """
        Perform the Box-Cox transformation
        """
        transformer = preprocessing.PowerTransformer('box-cox')
        self.scalers['box-cox'] = transformer
        self.output_df[columns] = transformer.fit_transform(
            self.output_df[columns])
Code example #12
    def _yeo_johnson_transform(self, column):
        """
        Perform the Yeo-Johnson transformation
        """
        transformer = preprocessing.PowerTransformer('yeo-johnson')
        self.scalers['yeo-johnson'] = transformer
        self.output_df[column] = transformer.fit_transform(
            self.output_df[column])
Code example #13
    def __init__(self, column, dataframe, settings):
        APreprocessor.__init__(self, column, dataframe, settings)
        self.scaler = preprocessing.PowerTransformer(
            method=self.settings.get('method', 'yeo-johnson'),
            standardize=self.settings.get('standardize', True),
            copy=self.settings.get('copy', True)
        )
        self.pickle_process(dataframe)
Code example #14
    def _power_transform(self):
        for c in self.num_feats:
            powt = preprocessing.PowerTransformer()
            powt.fit(self.df[c].values.reshape(-1, 1))
            self.output_df.loc[:, c] = powt.transform(
                self.df[c].values.reshape(-1, 1))
            self.power_transform_encoder[c] = powt
        return self.output_df, self.power_transform_encoder
Code example #15
def scaleData(houseData):
    # the original data
    dataX = houseData
    # standard scaling: zero mean and unit variance, so most values fall
    # within roughly +/-1, though outliers are not clipped
    scaledXnormal = preprocessing.scale(dataX)
    # PowerTransformer (yeo-johnson by default) reduces skewness by applying
    # a monotonic power transform
    pt = preprocessing.PowerTransformer()
    scaledXpowerTransformer = pt.fit_transform(dataX)

    return dataX, scaledXnormal, scaledXpowerTransformer
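A toy call, assuming `from sklearn import preprocessing` and NumPy:

import numpy as np

houseData = np.array([[100.0, 3.0], [250.0, 4.0], [120.0, 2.0]])
raw, standard, power = scaleData(houseData)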
Code example #16
    def scaler(self, method='yeo-johnson'):
        '''
        Scale data to a gaussian distribution N(0,1)

        Parameters
        ----------
        method : string, optional
            Method to use for the scaling transformation. The default is
            'yeo-johnson'.

        Returns
        -------
        dataframe : DataFrame
            Updated dataframe with the scaled data.
        scaler : object
            Scaler fitted on the data.

        '''

        if method == 'standard': scaler = preprocessing.StandardScaler()
        if method == 'minmax': scaler = preprocessing.MinMaxScaler()
        if method == 'maxabs': scaler = preprocessing.MaxAbsScaler()
        if method == 'robust': scaler = preprocessing.RobustScaler()
        if method == 'quantile':
            scaler = preprocessing.QuantileTransformer(
                output_distribution='normal')

        # preprocessing.normalize is a plain function; use the Normalizer
        # estimator so the fit/transform calls below work
        if method == 'l1': scaler = preprocessing.Normalizer(norm='l1')
        if method == 'l2': scaler = preprocessing.Normalizer(norm='l2')
        if method == 'max': scaler = preprocessing.Normalizer(norm='max')

        feature_sign = self._check_sign_feature()
        if method == 'box-cox' and feature_sign == 'positive':
            scaler = preprocessing.PowerTransformer('box-cox')
        elif method in ('box-cox', 'yeo-johnson'):
            # box-cox requires strictly positive data; fall back to yeo-johnson
            scaler = preprocessing.PowerTransformer('yeo-johnson')

        scaler.fit(self.dataframe)
        self.dataframe = scaler.transform(self.dataframe)

        return self.dataframe, scaler
Code example #17
def gaussian_scaler(train, test):
    scaler = skl.PowerTransformer(method='yeo-johnson')
    scaler.fit(train)
    
    train_scaled = pd.DataFrame(scaler.transform(train), 
                    columns=train.columns, index=train.index)
    
    test_scaled = pd.DataFrame(scaler.transform(test), 
                    columns=test.columns, index=test.index)
    
    return scaler, train_scaled, test_scaled
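Usage sketch with made-up frames; it assumes the snippet's `skl` alias is something like `import sklearn.preprocessing as skl`:

import pandas as pd

train = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})
test = pd.DataFrame({'x': [2.5, 3.5]})
scaler, train_scaled, test_scaled = gaussian_scaler(train, test)  # test reuses the train fit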
Code example #18
def scale_input_data(data):
    input_a = np.array(data).reshape(-1, 1)
    # note: the original refit the MinMaxScaler on the raw input, discarding
    # the power transform; the two transforms are chained here instead
    power = preprocessing.PowerTransformer()
    transformed = power.fit_transform(input_a)
    minmax = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    scaled = minmax.fit_transform(transformed)
    return [float(v) for v in scaled]
Code example #19
def allScalers():
    return {
        # "None": None,
        # "QuantileTransformerUniform": preprocessing.QuantileTransformer(output_distribution='uniform'),
        # "QuantileTransformerNormal": preprocessing.QuantileTransformer(output_distribution='normal'),
        "PowerTransformer": preprocessing.PowerTransformer(),
        # "RobustScaler": preprocessing.RobustScaler(),
        # "MaxAbsScaler": preprocessing.MaxAbsScaler(),
        # "MinMaxScaler": preprocessing.MinMaxScaler(),
        # "Normalizer": preprocessing.Normalizer(),
        # "StandardScaler": preprocessing.StandardScaler(),
    }
Code example #20
File: Classification.py Project: MenghsuanLiu/Python
def DataNormalize(data, method = "StandardScaler"):
    if method.lower() == "standardscaler":
        scaler = preprocessing.StandardScaler().fit(data)
    if method.lower() == "minmaxscaler":
        scaler = preprocessing.MinMaxScaler().fit(data)    
    if method.lower() == "maxabsscaler":
        scaler = preprocessing.MaxAbsScaler().fit(data)
    if method.lower() == "robustscaler":    
        scaler = preprocessing.RobustScaler().fit(data)
    if method.lower() == "powertransformer":
        scaler = preprocessing.PowerTransformer(method = "yeo-johnson", standardize = True).fit(data)
    return scaler.transform(data)
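Minimal example, assuming `from sklearn import preprocessing` and NumPy:

import numpy as np

data = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
normalized = DataNormalize(data, method="PowerTransformer")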
Code example #21
    def get_scaler(self, scalername=None):
        if scalername is None:
            scalername = self.scaler
        return {
            'NoScaler': None,
            'MinMaxScaler': preprocessing.MinMaxScaler(),
            'MaxAbsScaler': preprocessing.MaxAbsScaler(),
            'StandardScaler': preprocessing.StandardScaler(),
            'RobustScaler': preprocessing.RobustScaler(),
            'Normalizer': preprocessing.Normalizer(),
            'QuantileTransformer': preprocessing.QuantileTransformer(),
            'PowerTransformer': preprocessing.PowerTransformer()
        }.get(scalername)
Code example #22
File: feature_scale.py Project: jim-schwoebel/allie
def feature_scale(feature_scaler, X_train, y_train):

    # more information about these scalers can be found @
    # https://scikit-learn.org/stable/modules/preprocessing.html

    if feature_scaler == 'binarizer':
        # scale the X values in the set
        model = preprocessing.Binarizer()

    elif feature_scaler == 'one_hot_encoder':
        '''
        >>> enc.transform([['female', 'from US', 'uses Safari'],
        ...                ['male', 'from Europe', 'uses Safari']]).toarray()
        array([[1., 0., 0., 1., 0., 1.],
               [0., 1., 1., 0., 0., 1.]])
        '''
        # one-hot encode categorical values
        model = preprocessing.OneHotEncoder(handle_unknown='ignore')

    elif feature_scaler == 'maxabs':
        model = preprocessing.MaxAbsScaler()

    elif feature_scaler == 'minmax':
        model = preprocessing.MinMaxScaler()

    elif feature_scaler == 'normalize':
        # L2 normalization
        model = preprocessing.Normalizer()

    elif feature_scaler == 'poly':
        # scale the X values in the set
        model = PolynomialFeatures(2)

    elif feature_scaler == 'power_transformer':
        # scale the X values in the set
        model = preprocessing.PowerTransformer(method='yeo-johnson')

    elif feature_scaler == 'quantile_transformer_normal':
        # scale the X values in the set
        model = preprocessing.QuantileTransformer(output_distribution='normal')

    elif feature_scaler == 'robust':
        model = preprocessing.RobustScaler(quantile_range=(25, 75))

    elif feature_scaler == 'standard_scaler':
        # scale the X values in the set
        model = preprocessing.StandardScaler()

    else:
        raise ValueError('unknown feature_scaler: ' + feature_scaler)

    return model
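A sketch with toy arrays; note the function returns an unfitted transformer, so the caller fits it. Assumes `from sklearn import preprocessing` and NumPy:

import numpy as np

X_train = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y_train = np.array([0, 1, 0])
model = feature_scale('power_transformer', X_train, y_train)
X_train_scaled = model.fit_transform(X_train)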
Code example #23
    def impute_method_setup(
            self,
            random_state=DEFAULT_IMPUTER_RANDOM_STATE,
            add_indicator=DEFAULT_IMPUTER_ADD_INDICATOR,
            initial_strategy=DEFAULT_IMPUTER_INITIAL_STRATEGY,
            max_iter=DEFAULT_IMPUTER_MAX_ITER,
            estimator=DEFAULT_IMPUTER_ESTIMATOR,
            output_distribution=DEFAULT_TRANSFORMER_OUTPUT_DISTRIBUTION,
            transformer_method=DEFAULT_TRANSFORMER_METHOD,
            transformer_standardize=DEFAULT_TRANSFORMER_STANDARDIZE):
        """ Initialises the IterativeImputer, QuantileTransformer and PowerTransformer methods required 
            if missing data is to be imputed.
            
            Parameters are passed to the sklearn routines. Where this is being done it is noted below. 
            For further documentation on how these functions work, and what the parameters denote, 
            please refer to the sklearn documentation.

            IterativeImputer:    https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
            QuantileTransformer: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
            PowerTransformer:    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
            
            Args:
                random_state:           (int) (IterativeImputer & QuantileTransformer) seed for pseudo random number generator
                add_indicator:          (boolean) (IterativeImputer) if True adds a `MissingIndicator` transform to the stack
                initial_strategy:       (str) (IterativeImputer) define strategy to use for initialising missing values
                max_iter:               (int) (IterativeImputer) maximum number of imputation rounds to perform
                estimator:              (str) (IterativeImputer) estimator method to be used
                output_distribution:    (str) (QuantileTransformer) Marginal distribution for the transformed data
                transformer_method:      (str) (PowerTransformer) method to use, 'box-cox' is default
                transformer_standardize: (boolean) (PowerTransformer) select if zero-mean, unit-variance normalisation is applied, default is True

             Returns: None
        """

        # set the imputer options (if we are using them)
        self.imputer = IterativeImputer(random_state=random_state,
                                        add_indicator=add_indicator,
                                        initial_strategy=initial_strategy,
                                        max_iter=max_iter,
                                        verbose=self.verbose,
                                        estimator=estimator)

        # set the quantile transform options
        self.transformer_quantile = preprocessing.QuantileTransformer(
            output_distribution=output_distribution, random_state=random_state)

        # set the power transform options
        self.transformer_power = preprocessing.PowerTransformer(
            method=transformer_method, standardize=transformer_standardize)
Code example #24
def train_scaler(data,
                 preprocessing_type,
                 n_channels,
                 seq_length,
                 toscale='channels'):
    """scaling data with different modes, apply detrend per sample per channel
    :param data the dataset as array
    :type data ndarray
    :param n_channels the number of channels
    :type n_channels int
    :param seq_length the length of a sequence of one single sensor
    :type seq_length int
    :param preprocessing_type mode used for scaling, e.g. 'standard', 'zero2one'
    :type preprocessing_type str
    :param toscale mode used for scaling the time sequence of each channel('channels')
    or each timestep/feature over the corresponding samples('samples')
    :type toscale str
    :returns scaled dataset
    :rtype ndarray
    """

    scaler = None

    if toscale == 'channels':
        data = prep.concatenate_samples(data, n_channels, seq_length)
        data = data.transpose()

    if preprocessing_type == 'standard':
        # Center to the median and component wise scale according to the interquartile range
        scaler = skpreprocessing.RobustScaler().fit(data)

    elif preprocessing_type == 'gaussian':
        scaler = skpreprocessing.PowerTransformer().fit(
            data)  # map to a gaussian-like distribution

    elif preprocessing_type == 'zero2one':
        scaler = skpreprocessing.MinMaxScaler().fit(
            data)  # scale to range [0,1]

    elif preprocessing_type == 'neg_one2one':
        scaler = skpreprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(
            data)  # scale to range [-1, 1]

    elif preprocessing_type == 'fourier_samples':
        scaler = skpreprocessing.MinMaxScaler(feature_range=(-1, 1)).fit(
            data)  # scale to range [-1, 1]

    return scaler
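A toy call, assuming `import sklearn.preprocessing as skpreprocessing`; toscale='samples' sidesteps the project-specific `prep` helper:

import numpy as np

data = np.random.rand(8, 6)  # (samples, features)
scaler = train_scaler(data, 'gaussian', n_channels=2, seq_length=3, toscale='samples')
scaled = scaler.transform(data)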
Code example #25
def feature_transform(df, **params):
    """
    特征分布转换
    :param df:
    :param params:
    :return:
    """
    from sklearn import preprocessing
    import pandas as pd

    pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)

    df[DefaultConfig.outlier_columns] = pd.DataFrame(columns=DefaultConfig.outlier_columns,
                                                     data=pt.fit_transform(df[DefaultConfig.outlier_columns]))

    return df
Code example #26
    def add_power_transform_scaling(self, data: object, gaussian_like_scale_columns: List[str] = None,
                                    gaussian_like_method: str = "yeo-johnson", standardize=True):
        """
        Apply a power transform featurewise to make data more Gaussian-like.

        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html

        :param data: Input data to fit the model.
        :param gaussian_like_scale_columns: Columns to which we apply the scaling.
        :param gaussian_like_method: yeo-johnson or box-cox
        :param standardize: Set to True to apply zero-mean, unit-variance normalization to the output.
        :return:
        """
        scale = preprocessing.PowerTransformer(method=gaussian_like_method, standardize=standardize)

        self._normalization_steps.append((scale, gaussian_like_scale_columns))
Code example #27
def map_to_Gaussian(df, methodType):
    '''
    Mapping non-Gaussian distribution to Gaussian

    Input:
    df: DataFrame
    methodType: 'yeo-johnson' or 'box-cox'

    Output:
    Mapped dataframe
    '''

    pt = preprocessing.PowerTransformer(method=methodType, standardize=False)
    df = pt.fit_transform(df)

    return df
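A hedged sketch; box-cox requires strictly positive values, while yeo-johnson does not. Assumes pandas and `from sklearn import preprocessing`:

import pandas as pd

df = pd.DataFrame({'a': [0.5, 1.5, 2.5], 'b': [3.0, 1.0, 2.0]})
mapped = map_to_Gaussian(df, 'box-cox')  # note: fit_transform returns an ndarray, not a DataFrame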
Code example #28
File: utils.py Project: viviwang1008/ETH-ML
def scaleFeatures(data, opt='standard', **kwargs):
    from sklearn import preprocessing
    if opt == 'standard':
        scl = preprocessing.StandardScaler(**kwargs)
    elif opt == 'robust':
        scl = preprocessing.RobustScaler(**kwargs)
    elif opt == 'minmax':
        scl = preprocessing.MinMaxScaler(**kwargs)
    elif opt == 'norm':
        scl = preprocessing.Normalizer(**kwargs)
    elif opt == 'gaussian':  # doesn't work! no idea why
        scl = preprocessing.PowerTransformer(method='yeo-johnson')
    elif opt == 'quantile':
        scl = preprocessing.QuantileTransformer(output_distribution='normal')
    out = pd.DataFrame(scl.fit_transform(data), columns=data.columns)
    print("Features scaled using", opt, "scaling method!")
    return out
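An illustrative call with a small frame; assumes pandas is imported as pd:

import pandas as pd

data = pd.DataFrame({'f1': [1.0, 2.0, 3.0], 'f2': [4.0, 5.0, 6.0]})
scaled = scaleFeatures(data, opt='robust')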
Code example #29
def normalize_numeric_columns(training_data, test_data):
    normalish_columns = []
    other_positive_columns = []
    other_numeric_columns = []

    # Everything should be numeric at this point, so we can loop over all the columns
    for col in training_data.columns:
        if col == target_column or col == id_column:
            continue

        n, p = stats.normaltest(training_data[col])
        if p > .05:
            normalish_columns.append(col)
        elif (training_data[col] > 0).all():
            other_positive_columns.append(col)
        else:
            other_numeric_columns.append(col)

    if len(normalish_columns) > 0:
        scaler = preprocessing.StandardScaler().fit(
            training_data[normalish_columns])
        training_data[normalish_columns] = scaler.transform(
            training_data[normalish_columns])
        test_data[normalish_columns] = scaler.transform(
            test_data[normalish_columns])

    if len(other_positive_columns) > 0:
        transformer = preprocessing.PowerTransformer(
            method='box-cox',
            standardize=True).fit(training_data[other_positive_columns])
        training_data[other_positive_columns] = transformer.transform(
            training_data[other_positive_columns])
        test_data[other_positive_columns] = transformer.transform(
            test_data[other_positive_columns])

    if len(other_numeric_columns) > 0:
        rs = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(
            training_data[other_numeric_columns])
        training_data[other_numeric_columns] = rs.transform(
            training_data[other_numeric_columns])
        test_data[other_numeric_columns] = rs.transform(
            test_data[other_numeric_columns])

    return training_data, test_data
Code example #30
def yeo_johnson(df):
    """
    Wrapper for sklearn's preprocessing.PowerTransformer (Yeo-Johnson option),
    which can handle negative values.

    Parameters
    ----------
    df : DataFrame

    Returns
    -------
    DataFrame
        Yeo-Johnson transformed
    """
    assert (isinstance(df, pd.DataFrame))
    pt = preprocessing.PowerTransformer(method='yeo-johnson',
                                        standardize=False)
    return pd.DataFrame(pt.fit_transform(df))
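Usage sketch showing that the wrapper handles the negative values box-cox rejects; the returned frame has default integer column names:

import pandas as pd

df = pd.DataFrame({'a': [-1.0, 0.0, 2.0], 'b': [5.0, -3.0, 1.0]})
transformed = yeo_johnson(df)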