Code Example #1
def gaussian_scaler(train, validate, test):
    '''
    Accepts three dataframes and applies a transformer to convert values in each dataframe
    to a Gaussian-like distribution. This function uses the Yeo-Johnson method without standardization (standardize=False).
    Columns containing object data types are dropped, as strings cannot be directly scaled.

    Parameters (train, validate, test) = three dataframes being scaled
    
    Returns (scaler, train_scaled, validate_scaled, test_scaled)
    '''
    train = train.select_dtypes(exclude=['object'])
    validate = validate.select_dtypes(exclude=['object'])
    test = test.select_dtypes(exclude=['object'])
    scaler = PowerTransformer(method='yeo-johnson',
                              standardize=False,
                              copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    validate_scaled = pd.DataFrame(scaler.transform(validate),
                                   columns=validate.columns.values).set_index(
                                       [validate.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])
    return scaler, train_scaled, validate_scaled, test_scaled
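A minimal usage sketch for the function above (hypothetical data; assumes pandas as pd and sklearn's PowerTransformer are imported at module level, as the function requires):

import pandas as pd
from sklearn.preprocessing import PowerTransformer

# Hypothetical numeric frame split into train/validate/test portions.
df = pd.DataFrame({'monthly_charges': [29.9, 56.1, 99.5, 70.0, 45.3, 88.8, 19.7, 64.2],
                   'tenure': [1, 12, 60, 24, 6, 48, 2, 30]})
train, validate, test = df.iloc[:5], df.iloc[5:7], df.iloc[7:]
scaler, train_scaled, validate_scaled, test_scaled = gaussian_scaler(train, validate, test)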
Code Example #2
    def power_transformation(self):
        pt = PowerTransformer()
        pt.fit(self.training.select_dtypes(exclude='category'))
        temp = pd.DataFrame(
            pt.transform(self.training.select_dtypes(exclude='category')))
        temp.index = self.training.select_dtypes(exclude='category').index
        temp.columns = self.training.select_dtypes(exclude='category').columns
        for col in self.training.select_dtypes(include='category'):
            temp[col] = self.training[col]
        self.training = temp
        del temp
        temp = pd.DataFrame(
            pt.transform(self.unseen.select_dtypes(exclude='category')))
        temp.index = self.unseen.select_dtypes(exclude='category').index
        temp.columns = self.unseen.select_dtypes(exclude='category').columns
        for col in self.unseen.select_dtypes(include='category'):
            temp[col] = self.unseen[col]
        self.unseen = temp
        print(temp)

    def get_k_means_elbow_graph(self, min_clust, max_clust):
        # Fit K-Means for a range of cluster counts and plot the inertia elbow curve.
        rows = []
        for i in range(min_clust, max_clust):
            kmeans = KMeans(n_clusters=i).fit(
                self.training.select_dtypes(exclude='category'))
            rows.append({'num_clusters': i, 'inertia': kmeans.inertia_})
        km = pd.DataFrame(rows)
        sb.lineplot(x=km['num_clusters'], y=km['inertia'])
        return
Code Example #3
def do_skewremoval(X_train, X_test):
    transformer = PowerTransformer()
    transformer.fit(X_train)
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

    return X_train, X_test
Code Example #4
def box_cox(x_train, x_test=None):
    bc = PowerTransformer(method='box-cox')
    bc = bc.fit(x_train)
    x_train_bc = bc.transform(x_train)
    if x_test is not None:
        x_test_bc = bc.transform(x_test)
    else:
        x_test_bc = None
    return (x_train_bc, x_test_bc)
Code Example #5
def gaussian_scaler(train, test, method='yeo-johnson'):
    scaler = PowerTransformer(method, standardize=False, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])
    return scaler, train_scaled, test_scaled
Code Example #6
def gaussian_scaler(train_data, test_data, method='yeo-johnson'):
    scaler = PowerTransformer(method, standardize=False,
                              copy=True).fit(train_data)
    test_scaled = pd.DataFrame(scaler.transform(test_data),
                               columns=test_data.columns,
                               index=test_data.index)
    train_scaled = pd.DataFrame(scaler.transform(train_data),
                                columns=train_data.columns,
                                index=train_data.index)
    return scaler, train_scaled, test_scaled
Code Example #7
def gaussian_scaler(X_train, X_test):
    # Create a Gaussian scaler object and fit it to the train data
    gaussian_scaler = PowerTransformer(method="yeo-johnson", standardize=False, copy=True).fit(X_train)
    # Scale Train Data and Convert to a Data Frame
    scaled_X_train = gaussian_scaler.transform(X_train)
    scaled_X_train = pd.DataFrame(scaled_X_train, columns=X_train.columns.values).set_index([X_train.index.values])
    # Scale Test Data and Convert to a Data Frame
    scaled_X_test = gaussian_scaler.transform(X_test)
    scaled_X_test = pd.DataFrame(scaled_X_test, columns=X_test.columns.values).set_index([X_test.index.values])
    return scaled_X_train, scaled_X_test, gaussian_scaler
Code Example #8
def power_transformer(dataset):
    # split_train_test, percent_train, colnames, predictors and target are
    # assumed to be defined elsewhere in the module this snippet comes from.
    train_set, test_set = split_train_test(dataset, percent_train)
    scaler = PowerTransformer()
    scaler.fit(train_set)
    scaled_train_set = pd.DataFrame(scaler.transform(train_set), columns = colnames)
    scaled_test_set = pd.DataFrame(scaler.transform(test_set), columns = colnames)
    scaled_df = pd.concat([scaled_train_set, scaled_test_set])
    X = scaled_df[predictors]
    Y = scaled_df[target]
    return X, Y, scaler
Code Example #9
def gaussian_scaler(train, test):
    scaler = PowerTransformer()
    scaler.fit(train)
    train = pd.DataFrame(scaler.transform(train),
                         columns=train.columns.values).set_index(
                             [train.index.values])
    test = pd.DataFrame(scaler.transform(test),
                        columns=test.columns.values).set_index(
                            [test.index.values])
    return scaler, train, test
Code Example #10
def gaussian_scaler(train, test, method='yeo-johnson'):
    """Transforms and then normalizes data.
       Takes in a train and test set, 
       yeo_johnson allows for negative data,
       box_cox allows positive data only.
       Zero_mean, unit variance normalized train and test.
    """
    scaler = PowerTransformer(method, standardize=False, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
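The docstring's distinction between the two methods is easy to verify: Box-Cox rejects non-positive values while Yeo-Johnson accepts them. A small sketch on hypothetical data:

import numpy as np
from sklearn.preprocessing import PowerTransformer

X = np.array([[-1.0], [0.0], [2.5], [7.0]])                # contains non-positive values
PowerTransformer(method='yeo-johnson').fit_transform(X)    # works
try:
    PowerTransformer(method='box-cox').fit_transform(X)    # requires strictly positive input
except ValueError as err:
    print(err)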
Code Example #11
def gaussian_scaler(X):
    train, test = split_my_data(X)
    scaler = PowerTransformer(method='box-cox', standardize=False,
                              copy=True).fit(train)
    train_scaled_data = pd.DataFrame(scaler.transform(train),
                                     columns=train.columns.values).set_index(
                                         [train.index.values])
    test_scaled_data = pd.DataFrame(scaler.transform(test),
                                    columns=test.columns.values).set_index(
                                        [test.index.values])
    return scaler, train_scaled_data, test_scaled_data
Code Example #12
File: wine.py Project: urvigodha/PRESC
def transform(X_train, X_test):
    from sklearn.preprocessing import PowerTransformer

    transformer = PowerTransformer(method="yeo-johnson", standardize=False)
    transformer.fit(X_train)
    X_train_array = transformer.transform(X_train)
    X_train = pd.DataFrame(data=X_train_array,
                           index=X_train.index,
                           columns=X_train.columns)
    X_test_array = transformer.transform(X_test)
    X_test = pd.DataFrame(data=X_test_array,
                          index=X_test.index,
                          columns=X_test.columns)
    return X_train, X_test
Code Example #13
def gaussian_scaler(x_train, x_test):
    # Fit the Yeo-Johnson transformer on the two continuous columns only.
    cols = ['monthly_charges', 'tenure']
    g_x_train_scaler = PowerTransformer(method='yeo-johnson',
                                        standardize=False,
                                        copy=True).fit(x_train[cols])

    # Transform the same columns the scaler was fit on; transforming the full
    # frame after fitting on a subset would raise a feature-count mismatch.
    g_x_train_scaled = pd.DataFrame(g_x_train_scaler.transform(x_train[cols]),
                                    columns=cols,
                                    index=x_train.index)
    g_x_test_scaled = pd.DataFrame(g_x_train_scaler.transform(x_test[cols]),
                                   columns=cols,
                                   index=x_test.index)

    return g_x_train_scaled, g_x_test_scaled
Code Example #14
def gaussian_scaler(train, test):
    # create scaler object using yeo-johnson method and fit to train
    scaler = PowerTransformer(method='yeo-johnson',
                              standardize=False,
                              copy=True).fit(train)
    # apply to train
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    # apply to test
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])

    return train_scaled, test_scaled, scaler
Code Example #15
class BoxCox(BaseStep):
    """
    sklearn PowerTransformer(method='box-cox') implementation
    """
    def __init__(self, name='BoxCox'):
        super().__init__(name)
        self.inplace = True
        self.power = PowerTransformer(method='box-cox', standardize=False)

    def fit(self, X, y):
        """
        Fit
        """
        self.set_X(X)
        self.power.fit(X)

        return self.transform(X, y)

    def transform(self, X, y=None):
        """
        Transform
        """
        return self.power.transform(X), y

    def get_template_data(self):
        """
        Get template data
        """
        return {
            'lambdas': self.power.lambdas_,
            'has_zeros': len([l for l in self.power.lambdas_ if l == 0])
        }
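get_template_data exposes the fitted lambdas_; the same attribute is available on any fitted PowerTransformer. A short sketch with hypothetical strictly positive data:

import numpy as np
from sklearn.preprocessing import PowerTransformer

X = np.abs(np.random.randn(100, 3)) + 1.0      # strictly positive, as Box-Cox requires
pt = PowerTransformer(method='box-cox', standardize=False).fit(X)
print(pt.lambdas_)                             # one estimated lambda per column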
Code Example #16
def get_processed_dataset(filepath):

    df_raw = get_dataset(filepath)
    flags = df_raw['FLAG']
    df_raw.drop(['FLAG'], axis=1, inplace=True)

    # original df with NaN replaced by 0
    df_zero = df_raw.copy()
    df_zero = df_zero.apply(lambda row: row.fillna(0))
    """## Transformar Yeo - Johson"""

    # transformar com Yeo - Johson
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    skl_yeojohnson = pt.fit(df_zero.values)
    lambdas_found = skl_yeojohnson.lambdas_
    skl_yeojohnson = pt.transform(df_zero.values)
    df_yj = pd.DataFrame(data=skl_yeojohnson,
                         columns=df_zero.columns,
                         index=df_zero.index)
    """## Aplicar Z-score"""

    # aplizar Z-score
    df_zscore = pd.DataFrame(data=zscore(df_yj),
                             columns=df_zero.columns,
                             index=df_zero.index)
    df_zscore['flag'] = flags
    return df_zscore.iloc[:, 5:]
Code Example #17
    def dataloader(self):

        cols_drop = [
            "actual_load",
        ]
        X_train = self.train.drop(columns=cols_drop)
        y_train = self.train.actual_load
        X_test = self.test.drop(columns=cols_drop)
        y_test = self.test.actual_load
        X_val = self.val.drop(columns=cols_drop)
        y_val = self.val.actual_load

        if self.transform is not None:
            scaler = PowerTransformer(method="box-cox")
            y_train = scaler.fit_transform(
                np.array(self.train.actual_load).reshape(-1, 1))
            y_train = y_train.ravel()

            y_val = scaler.transform(
                np.array(self.val.actual_load).reshape(-1, 1))
            y_val = y_val.ravel()

            # Saving sklearn transformation file to be further used for inverse transformation in test.py
            scaler_filename = SCALER_FILENAME
            joblib.dump(scaler, scaler_filename)

        return X_train, y_train, X_val, y_val, X_test, y_test
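The comment above says the saved scaler is reused for inverse transformation in test.py; that file is not shown here, but a hedged sketch of what the inverse step might look like (the prediction values are hypothetical):

import joblib
import numpy as np

scaler = joblib.load(SCALER_FILENAME)                # Box-Cox transformer saved by dataloader()
preds_scaled = np.array([0.12, -0.30, 0.85])         # hypothetical model outputs on the transformed scale
preds = scaler.inverse_transform(preds_scaled.reshape(-1, 1)).ravel()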
Code Example #18
class BoxCox(Primitive):
    """ Power Transform primitive.

        The class applies BoxCox power transformation to make the selected features
        have normal distribution.

        # Arguments
            transformer: PowerTransformer. Instance of scikit-learn PowerTransformer
            object
    """
    transformer = None
    supported_ops = ('add', 'upd')

    def _fit(self, data, y=None):
        self.transformer = PowerTransformer()
        self.transformer.fit(data.X[self.selected], y)
        return self

    def _transform(self, data, y=None):
        x_tr = self.transformer.transform(data.X[self.selected])
        data.update(self.operation,
                    self.selected,
                    x_tr,
                    new_type='NUM',
                    key=self.name_key)
        return data
Code Example #19
def augmentation(X, Y, noise = False, bootstrapping = True, noiseSTD = [0.1/2, 0.1/2, 0.01/2, 0.0002/2,0.01/2,0.02/2], nr_boot =1000, bootstrap_bl_size = 488, boot_freq = 100):
    
    if noise:
        Xn = X.copy()
        for i, j, k in np.ndindex(X.shape):
            Xn[i, j, k] += np.random.normal(0, 1)*noiseSTD[k] 

        X = np.vstack([X, Xn])
        Y = np.vstack([Y, Y])
        
    if bootstrapping:
        Xb = X.copy()
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        
        for i in range(Xb.shape[0]):
            pt.fit(Xb[i])
            lambda_param = pt.lambdas_
            transformed = pt.transform(Xb[i])
            result = seasonal_decompose(transformed, model='additive', period=boot_freq)
            
            # Moving Block Bootstrap on Residuals
            bootstrapRes = MBB(bootstrap_bl_size, result.resid)
            for data in bootstrapRes.bootstrap(nr_boot):
                bs_x = data[0][0]
            
            reconSeriesYC = result.trend + result.seasonal + bs_x
            Xb[i] = pt.inverse_transform(reconSeriesYC)
        
        for i,j,k in np.ndindex(X.shape):
            if np.isnan(Xb[i,j,k]):
                Xb[i,j,k] = X[i,j,k]
        X = np.vstack([X, Xb])
        Y = np.vstack([Y, Y])

    return X, Y
Code Example #20
def main():
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    else:
        pass
    for c in col:
        if c in df:
            df[c] = df[c].astype('int64')
            features = df[[c]]
            pt = PowerTransformer(method='yeo-johnson', standardize=True,) 
            #Fit the data to the powertransformer
            pt_yeojohnson = pt.fit(features)
            #Transform the data 
            pt_yeojohnson = pt.transform(features)
            #Pass the transformed data into a new dataframe 
            df_xt = pd.DataFrame(data=pt_yeojohnson, columns=[c + '_yeojohn'])
            df=df.join(df_xt)
            

        else:
            pass
    
    
    return _helper.publish(df)
Code Example #21
def df_power_transformer(df):
    from sklearn.preprocessing import PowerTransformer
    power_transform_scaler = PowerTransformer().fit(df)
    df = pd.DataFrame(power_transform_scaler.transform(df), columns=df.columns)
    print("DataSet QuantileScaled...")
    df.head()
    return df
Code Example #22
File: ingest.py Project: bhayden53/calcloud
def transformer(inputs):
    """applies yeo-johnson power transform to first two indices of array (n_files, total_mb) using lambdas, mean and standard deviation calculated for each variable prior to model training.

    Returns: X inputs as 2D-array for generating predictions
    """
    X = inputs
    n_files = X[0]
    total_mb = X[1]
    # apply power transformer normalization to continuous vars
    x = np.array([[n_files], [total_mb]]).reshape(1, -1)
    pt = PowerTransformer(standardize=False)
    pt.lambdas_ = np.array([-1.51, -0.12])
    xt = pt.transform(x)
    # normalization (zero mean, unit variance)
    f_mean, f_sigma = 0.5682815234265285, 0.04222565843608133
    s_mean, s_sigma = 1.6250374589283951, 1.0396138451086632
    x_files = np.round(((xt[0, 0] - f_mean) / f_sigma), 5)
    x_size = np.round(((xt[0, 1] - s_mean) / s_sigma), 5)
    # print(f"Power Transformed variables: {x_files}, {x_size}")
    X_values = {
        "x_files": x_files,
        "x_size": x_size,
        "drizcorr": X[2],
        "pctecorr": X[3],
        "crsplit": X[4],
        "subarray": X[5],
        "detector": X[6],
        "dtype": X[7],
        "instr": X[8],
    }
    # X = np.array([x_files, x_size, X[2], X[3], X[4], X[5], X[6], X[7], X[8]])
    return X_values
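Because the lambdas, means and sigmas are hard-coded, the function runs without a fitted transformer on disk. A hedged usage sketch with hypothetical input values (everything after the first two entries is passed through unchanged):

# Hypothetical inputs: n_files, total_mb, then already-encoded categorical features.
inputs = [12, 340.5, 1, 1, 2, 0, 1, 0, 3]
X_values = transformer(inputs)
print(X_values['x_files'], X_values['x_size'])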
Code Example #23
def process_smiles_features(chemical_features):
    
    db = chemical_features.copy()
    
    # Bonds Number
    db.bonds_number = db.bonds_number.apply(lambda x: np.log1p(x))
    
    minmax = MinMaxScaler()
    minmax.fit(db[["bonds_number"]])
    db[["bonds_number"]] = minmax.transform(db[["bonds_number"]])
    
    # Atom Number
    db.atom_number = db.atom_number.apply(lambda x: np.log1p(x))
    
    minmax = MinMaxScaler()
    minmax.fit(db[["atom_number"]])
    db[["atom_number"]] = minmax.transform(db[["atom_number"]])
    
    # Molecular Weight
    db.Mol = db.Mol.apply(lambda x: np.log1p(x))
    
    minmax = MinMaxScaler()
    minmax.fit(db[["Mol"]])
    db[["Mol"]] = minmax.transform(db[["Mol"]])
    
    # Water Solubility (Box-Cox requires strictly positive values)
    pt = PowerTransformer(method='box-cox')

    pt.fit(db.WaterSolubility.values.reshape(-1, 1))
    db['WaterSolubility'] = pt.transform(db.WaterSolubility.values.reshape(-1, 1)).ravel()
    
    return db
Code Example #24
class PreProcess(BaseEstimator):
    def __init__(self, classifier_type: str = 'MinMaxScaler'):

        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        if self.classifier_type == 'StandardScaler':
            self.classifier_ = StandardScaler()
        elif self.classifier_type == 'MinMaxScaler':
            self.classifier_ = MinMaxScaler()
        elif self.classifier_type == 'MaxAbsScaler':
            self.classifier_ = MaxAbsScaler()
        elif self.classifier_type == 'RobustScaler':
            self.classifier_ = RobustScaler()
        elif self.classifier_type == 'QuantileTransformerUniform':
            self.classifier_ = QuantileTransformer(
                output_distribution="uniform")
        elif self.classifier_type == 'QuantileTransformerNormal':
            self.classifier_ = QuantileTransformer(
                output_distribution="normal")
        elif self.classifier_type == 'PowerTransformer':
            self.classifier_ = PowerTransformer(method="yeo-johnson")
        else:
            raise ValueError('Unknown classifier type.')
        self.classifier_.fit(X)
        return self

    def transform(self, X, y=None):
        return self.classifier_.transform(X)
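A short usage sketch for the wrapper above (hypothetical data; assumes the scaler classes referenced in fit are imported from sklearn.preprocessing):

import numpy as np

X = np.random.rand(50, 4)                               # hypothetical feature matrix
pre = PreProcess(classifier_type='PowerTransformer').fit(X)
X_transformed = pre.transform(X)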
Code Example #25
class PowerTransformerPrim(primitive):
    def __init__(self, random_state=0):
        super(PowerTransformerPrim, self).__init__(name='PowerTransformer')
        self.id = 12
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = "Apply a power transform featurewise to make data more Gaussian-like. Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Currently, PowerTransformer supports the Box-Cox transform and the Yeo-Johnson transform. The optimal parameter for stabilizing variance and minimizing skewness is estimated through maximum likelihood. Box-Cox requires input data to be strictly positive, while Yeo-Johnson supports both positive or negative data. By default, zero-mean, unit-variance normalization is applied to the transformed data."
        self.hyperparams_run = {'default': True}
        self.scaler = PowerTransformer()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_pwrtrnsfrm".format(x) for x in cols]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        final_output = {0: output}
        return final_output
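The description notes that zero-mean, unit-variance normalization is applied by default; a quick check of that default on hypothetical skewed data:

import numpy as np
from sklearn.preprocessing import PowerTransformer

X = np.random.exponential(size=(200, 2))                    # hypothetical right-skewed data
Xt = PowerTransformer().fit_transform(X)                    # standardize=True is the default
print(Xt.mean(axis=0).round(6), Xt.std(axis=0).round(6))    # roughly 0 and 1 per column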
Code Example #26
    def gaussian_trans_(self,
                        random_state,
                        X_samp,
                        listX,
                        distri='normal',
                        noise=0):
        if False:
            #x = np.hstack([np.random.standard_cauchy(size=(1000, 2)), np.random.normal(size=(1000, 2))])
            trans = Gaussianize(tol=1e-2, max_iter=10)
            trans.fit(X_samp)  # Learn the parameters for the transformation
            for i, X_ in enumerate(listX):
                if X_ is None:
                    continue
                y = trans.transform(
                    X_)  # Transform x to y, where y should be normal
                listX[i] = trans.inverse_transform(y).astype(
                    np.float32
                )  # Inverting this transform should recover the data
            #assert np.allclose(x_prime, x)
            #trans.qqplot(x,output_dir="E:/QuantumForest/dump/")  # Plot qq plots for each variable, before and after.
            #print()
        else:
            power = PowerTransformer(method='yeo-johnson').fit(X_samp)
            for i, X_ in enumerate(listX):
                if X_ is None:
                    continue
                listX[i] = power.transform(X_)
        return listX, power
Code Example #27
File: prep.py Project: spacetelescope/calcloud
def update_power_transform(df):
    pt = PowerTransformer(standardize=False)
    df_cont = df[["n_files", "total_mb"]]
    pt.fit(df_cont)
    input_matrix = pt.transform(df_cont)
    # FILES (n_files)
    f_mean = np.mean(input_matrix[:, 0])
    f_sigma = np.std(input_matrix[:, 0])
    # SIZE (total_mb)
    s_mean = np.mean(input_matrix[:, 1])
    s_sigma = np.std(input_matrix[:, 1])
    files = input_matrix[:, 0]
    size = input_matrix[:, 1]
    x_files = (files - f_mean) / f_sigma
    x_size = (size - s_mean) / s_sigma
    normalized = np.stack([x_files, x_size], axis=1)
    idx = df_cont.index
    df_norm = pd.DataFrame(normalized,
                           index=idx,
                           columns=["x_files", "x_size"])
    df["x_files"] = df_norm["x_files"]
    df["x_size"] = df_norm["x_size"]
    lambdas = pt.lambdas_
    pt_transform = {
        "f_lambda": lambdas[0],
        "s_lambda": lambdas[1],
        "f_mean": f_mean,
        "f_sigma": f_sigma,
        "s_mean": s_mean,
        "s_sigma": s_sigma,
    }
    print(pt_transform)
    return df, pt_transform
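The manual mean/sigma step above reproduces what standardize=True does internally; keeping standardize=False simply makes the per-variable means and sigmas available for reuse (as in the hard-coded transformer() example earlier). A hedged equivalence sketch on hypothetical data:

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

df_cont = pd.DataFrame({"n_files": np.random.randint(1, 100, 50),
                        "total_mb": np.random.exponential(500.0, 50)})
manual = PowerTransformer(standardize=False).fit_transform(df_cont)
manual = (manual - manual.mean(axis=0)) / manual.std(axis=0)
auto = PowerTransformer(standardize=True).fit_transform(df_cont)
print(np.allclose(manual, auto))                            # expected to print True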
Code Example #28
    def fit(self, dataframe: DataFrame) -> None:
        """Estimate and save the optimal parameter of PowerTransformer for each feature.
        Also store values require to scale and unscale the features if scaling needs to be apply.

        :param dataframe: dataframe containing only the features that needs to be normalize
        """
        for feature in list(dataframe):
            self._registered_features.append(feature)
            data = dataframe[feature].to_numpy().reshape(-1, 1)  # load feature into a numpy array

            self._transformers[feature] = {}                     # initialized storage for the feature transformers

            if self.to_log(data):                                # log-transform features flagged by to_log
                data = np.log(data)
                self._log_features.append(feature)

            power_transformer = PowerTransformer()               # power transform for the (possibly logged) feature
            power_transformer.fit(data)

            self._transformers[feature]['normalizer'] = power_transformer

            if self._scale:
                scaler = MinMaxScaler(feature_range=self._scale)
                scaler.fit(power_transformer.transform(data))
                self._transformers[feature]['scaler'] = scaler
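The corresponding transform step is not shown; a hedged standalone sketch of the same per-feature pattern (optional log, then PowerTransformer, then MinMaxScaler) on hypothetical data:

import numpy as np
from sklearn.preprocessing import MinMaxScaler, PowerTransformer

data = np.random.lognormal(size=(100, 1))                   # hypothetical single feature
data = np.log(data)                                         # mirror the optional log step
normalizer = PowerTransformer().fit(data)
scaler = MinMaxScaler(feature_range=(0, 1)).fit(normalizer.transform(data))
normalized = scaler.transform(normalizer.transform(data))   # values end up in [0, 1]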
Code Example #29
def gaussian_scaler(train, validate, test):
    '''
    This function scales data using a Gaussian scaler: a PowerTransformer with the Yeo-Johnson
    method (which, unlike Box-Cox, handles negative as well as positive data) to transform the
    selected columns toward a normal distribution.
    '''
    scaler = PowerTransformer(method='yeo-johnson',
                              standardize=False,
                              copy=True)
    train[['monthly_charges', 'tenure',
           'total_charges']] = scaler.fit_transform(
               train[['monthly_charges', 'tenure', 'total_charges']])
    validate[['monthly_charges', 'tenure',
              'total_charges']] = scaler.transform(
                  validate[['monthly_charges', 'tenure', 'total_charges']])
    test[['monthly_charges', 'tenure', 'total_charges']] = scaler.transform(
        test[['monthly_charges', 'tenure', 'total_charges']])
    return scaler, train, validate, test

def yeo_johnson_transf(data):
    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    pt.fit(data)

    lambdas = pt.lambdas_
    df_yeojohnson = pd.DataFrame( pt.transform(data), columns=data.columns.values )
    
    return df_yeojohnson, lambdas
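yeo_johnson_transf returns the transformed frame and the fitted lambdas but not the transformer itself, so the caller cannot invert the transform with what it returns; a hedged round-trip sketch that keeps the fitted transformer (hypothetical data):

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

data = pd.DataFrame({'a': np.random.exponential(size=100),
                     'b': np.random.normal(size=100)})
pt = PowerTransformer(method='yeo-johnson', standardize=True).fit(data)
transformed = pd.DataFrame(pt.transform(data), columns=data.columns.values)
recovered = pt.inverse_transform(transformed)
print(np.allclose(recovered, data.values))                  # expected to print True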