def gaussian_scaler(train, validate, test):
    '''Scale three dataframes toward a gaussian-like distribution.

    Applies a Yeo-Johnson power transform fitted on ``train`` only.
    ``standardize=False``, so the output is gaussian-like but NOT
    rescaled to zero mean / unit variance (the original docstring
    claimed "standard normal", which was inaccurate). Object-dtype
    columns are dropped first, since strings cannot be scaled.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame

    Returns
    -------
    (scaler, train_scaled, validate_scaled, test_scaled)
    '''
    # Strings cannot be power-transformed; keep numeric columns only.
    train = train.select_dtypes(exclude=['object'])
    validate = validate.select_dtypes(exclude=['object'])
    test = test.select_dtypes(exclude=['object'])
    # Fit on train only so validate/test reuse train's fitted lambdas.
    scaler = PowerTransformer(method='yeo-johnson', standardize=False,
                              copy=True).fit(train)

    def _frame(df):
        # Rebuild a DataFrame with the input's own labels and index
        # (cleaner than the original set_index([ndarray]) round-trip).
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns, index=df.index)

    return scaler, _frame(train), _frame(validate), _frame(test)
def power_transformation(self):
    """Fit a PowerTransformer on the non-category columns of the training
    frame and apply it to both ``self.training`` and ``self.unseen``,
    re-attaching the category columns untouched."""
    pt = PowerTransformer()
    numeric_train = self.training.select_dtypes(exclude='category')
    pt.fit(numeric_train)
    temp = pd.DataFrame(pt.transform(numeric_train),
                        index=numeric_train.index,
                        columns=numeric_train.columns)
    # Carry the category columns over unchanged.
    for col in self.training.select_dtypes(include='category'):
        temp[col] = self.training[col]
    self.training = temp
    numeric_unseen = self.unseen.select_dtypes(exclude='category')
    temp = pd.DataFrame(pt.transform(numeric_unseen),
                        index=numeric_unseen.index,
                        columns=numeric_unseen.columns)
    for col in self.unseen.select_dtypes(include='category'):
        temp[col] = self.unseen[col]
    self.unseen = temp
    print(temp)  # NOTE(review): leftover debug output — consider removing.


def get_k_means_elbow_graph(ds, numerical, min_clust, max_clust):
    """Plot an elbow graph of KMeans inertia for cluster counts in
    [min_clust, max_clust).

    Bug fix: the original clustered ``self.training`` even though this
    function receives ``ds`` and has no ``self`` parameter (NameError at
    runtime); it now clusters ``ds``. Also replaces the removed
    ``DataFrame.append`` with a list + constructor.
    """
    rows = []
    for i in range(min_clust, max_clust):
        kmeans = KMeans(n_clusters=i).fit(ds.select_dtypes(exclude='category'))
        rows.append({'num_clusters': i, 'inertia': kmeans.inertia_})
    km = pd.DataFrame(rows, columns=['num_clusters', 'inertia'])
    sb.lineplot(x=km['num_clusters'], y=km['inertia'])
    return
def do_skewremoval(X_train, X_test):
    """Remove skew from the feature sets with a power transform.

    The transformer is fitted on the training data only, then applied to
    both sets. Returns the transformed (X_train, X_test) arrays.
    """
    skew_remover = PowerTransformer()
    skew_remover.fit(X_train)
    return skew_remover.transform(X_train), skew_remover.transform(X_test)
def box_cox(x_train, x_test=None):
    """Fit a Box-Cox power transform on x_train and apply it to both sets.

    Box-Cox requires strictly positive input. Returns the tuple
    (x_train_bc, x_test_bc); x_test_bc is None when no test set is given.
    """
    transformer = PowerTransformer(method='box-cox').fit(x_train)
    transformed_train = transformer.transform(x_train)
    transformed_test = (transformer.transform(x_test)
                        if x_test is not None else None)
    return (transformed_train, transformed_test)
def gaussian_scaler(train, test, method='yeo-johnson'):
    """Power-transform train and test toward a gaussian-like shape.

    Fitted on ``train`` only; ``standardize=False`` so no zero-mean /
    unit-variance rescaling happens. Returns
    (scaler, train_scaled, test_scaled).
    """
    scaler = PowerTransformer(method, standardize=False, copy=True).fit(train)
    scaled_frames = []
    for frame in (train, test):
        rebuilt = pd.DataFrame(scaler.transform(frame),
                               columns=frame.columns.values)
        scaled_frames.append(rebuilt.set_index([frame.index.values]))
    return (scaler, *scaled_frames)
def gaussian_scaler(train_data, test_data, method='yeo-johnson'):
    """Apply a power transform (default Yeo-Johnson) to both splits.

    The transformer is fitted only on ``train_data``; each output frame
    keeps the original index and columns. Returns
    (scaler, train_scaled, test_scaled).
    """
    scaler = PowerTransformer(method, standardize=False, copy=True)
    scaler.fit(train_data)

    def _as_frame(source):
        # Re-wrap the numpy output with the source frame's labels.
        return pd.DataFrame(scaler.transform(source),
                            columns=source.columns, index=source.index)

    test_scaled = _as_frame(test_data)
    train_scaled = _as_frame(train_data)
    return scaler, train_scaled, test_scaled
def gaussian_scaler(X_train, X_test):
    """Yeo-Johnson scale X_train/X_test, fitting on the train split only.

    Returns (scaled_X_train, scaled_X_test, fitted_transformer).
    """
    # Local renamed so it no longer shadows the function's own name.
    transformer = PowerTransformer(method="yeo-johnson", standardize=False,
                                   copy=True).fit(X_train)
    scaled_X_train = pd.DataFrame(
        transformer.transform(X_train),
        columns=X_train.columns.values).set_index([X_train.index.values])
    scaled_X_test = pd.DataFrame(
        transformer.transform(X_test),
        columns=X_test.columns.values).set_index([X_test.index.values])
    return scaled_X_train, scaled_X_test, transformer
def power_transformer(dataset):
    """Split ``dataset``, power-transform both splits with a transformer
    fitted on the train split, and return predictor/target frames.

    NOTE(review): this relies on module-level names not defined in this
    block — ``split_train_test``, ``percent_train``, ``colnames``,
    ``predictors`` and ``target``; verify they exist at call time.

    Returns (X, Y, scaler).
    """
    train_set, test_set = split_train_test(dataset, percent_train)
    scaler = PowerTransformer()
    scaler.fit(train_set)
    scaled_train_set = pd.DataFrame(scaler.transform(train_set), columns = colnames)
    scaled_test_set = pd.DataFrame(scaler.transform(test_set), columns = colnames)
    # NOTE(review): both scaled frames get fresh RangeIndexes, so this
    # concat can yield duplicate index labels — confirm downstream code
    # tolerates that.
    scaled_df = pd.concat([scaled_train_set, scaled_test_set])
    X = scaled_df[predictors]
    Y = scaled_df[target]
    return X, Y, scaler
def gaussian_scaler(train, test):
    """Fit a PowerTransformer on ``train`` and apply it to both splits.

    Returns (scaler, train, test) where the frames carry their original
    column labels and index values.
    """
    scaler = PowerTransformer()
    scaler.fit(train)
    rebuilt = [
        pd.DataFrame(scaler.transform(frame),
                     columns=frame.columns.values).set_index([frame.index.values])
        for frame in (train, test)
    ]
    return scaler, rebuilt[0], rebuilt[1]
def gaussian_scaler(train, test, method='yeo-johnson'):
    """Power-transform train and test toward a gaussian-like shape.

    'yeo-johnson' accepts negative data; 'box-cox' requires strictly
    positive data. ``standardize=False``, so the output is NOT rescaled
    to zero mean / unit variance — the original docstring claimed a
    "zero_mean, unit variance" result, which was wrong for this
    configuration.

    Returns (scaler, train_scaled, test_scaled).
    """
    # Fit on train only; test reuses train's fitted lambdas.
    scaler = PowerTransformer(method, standardize=False, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled
def gaussian_scaler(X):
    """Split X into train/test, fit a Box-Cox transform on the train
    split (requires strictly positive data), and scale both splits.

    Returns (scaler, train_scaled_data, test_scaled_data).
    """
    train, test = split_my_data(X)
    scaler = PowerTransformer(method='box-cox', standardize=False,
                              copy=True).fit(train)

    def _scale(frame):
        # Restore the labels lost by the numpy round-trip.
        return pd.DataFrame(scaler.transform(frame),
                            columns=frame.columns.values).set_index(
                                [frame.index.values])

    return scaler, _scale(train), _scale(test)
def transform(X_train, X_test):
    """Apply a Yeo-Johnson power transform fitted on X_train to both
    splits, preserving each frame's index and columns.

    Returns the transformed (X_train, X_test) dataframes.
    """
    from sklearn.preprocessing import PowerTransformer
    yeo_johnson = PowerTransformer(method="yeo-johnson", standardize=False)
    yeo_johnson.fit(X_train)
    rebuilt = []
    for frame in (X_train, X_test):
        rebuilt.append(pd.DataFrame(data=yeo_johnson.transform(frame),
                                    index=frame.index,
                                    columns=frame.columns))
    return rebuilt[0], rebuilt[1]
def gaussian_scaler(x_train, x_test):
    """Yeo-Johnson scale x_train/x_test, fitting on the train split.

    Bug fix: the original fitted the transformer on only
    ``x_train[['monthly_charges', 'tenure']]`` but then transformed the
    full frames, which raises a feature-count mismatch whenever the
    frames contain any other column. The transformer is now fitted on
    every column it is asked to transform.

    Returns (g_x_train_scaled, g_x_test_scaled).
    """
    scaler = PowerTransformer(method='yeo-johnson', standardize=False,
                              copy=True).fit(x_train)
    g_x_train_scaled = pd.DataFrame(
        scaler.transform(x_train),
        columns=x_train.columns.values).set_index([x_train.index.values])
    g_x_test_scaled = pd.DataFrame(
        scaler.transform(x_test),
        columns=x_test.columns.values).set_index([x_test.index.values])
    return g_x_train_scaled, g_x_test_scaled
def gaussian_scaler(train, test):
    """Yeo-Johnson scale the train and test frames (fit on train only).

    Returns (train_scaled, test_scaled, scaler).
    """
    # Fit the Yeo-Johnson transformer on the training data alone.
    scaler = PowerTransformer(method='yeo-johnson', standardize=False,
                              copy=True)
    scaler.fit(train)

    def rebuild(frame):
        # Re-attach the labels lost when transform returns a numpy array.
        scaled = pd.DataFrame(scaler.transform(frame),
                              columns=frame.columns.values)
        return scaled.set_index([frame.index.values])

    return rebuild(train), rebuild(test), scaler
class BoxCox(BaseStep):
    """Pipeline step wrapping sklearn's PowerTransformer(method='box-cox')."""

    def __init__(self, name='BoxCox'):
        super().__init__(name)
        self.inplace = True
        self.power = PowerTransformer(method='box-cox', standardize=False)

    def fit(self, X, y):
        """Record X, fit the underlying transformer, and return the
        transformed (X, y) pair."""
        self.set_X(X)
        self.power.fit(X)
        return self.transform(X, y)

    def transform(self, X, y=None):
        """Apply the fitted Box-Cox transform; y passes through untouched."""
        transformed = self.power.transform(X)
        return transformed, y

    def get_template_data(self):
        """Expose the fitted lambdas and a count of zero-valued lambdas
        (a zero lambda means that feature was log-transformed)."""
        lambdas = self.power.lambdas_
        zero_count = sum(1 for lam in lambdas if lam == 0)
        return {'lambdas': lambdas, 'has_zeros': zero_count}
def get_processed_dataset(filepath):
    """Load the dataset, replace NaNs with 0, apply a Yeo-Johnson power
    transform followed by a Z-score, and return the Z-scored frame from
    column position 5 onward, with the original FLAG column re-attached
    as 'flag'.

    Fixes vs. the original: the unused ``lambdas_found`` local is gone,
    and ``skl_yeojohnson`` is no longer reused for both the fitted
    transformer and the transformed array.
    """
    df_raw = get_dataset(filepath)
    flags = df_raw['FLAG']
    df_raw.drop(['FLAG'], axis=1, inplace=True)
    # Copy of the raw frame with NaN replaced by 0.
    df_zero = df_raw.copy().fillna(0)
    # Yeo-Johnson transform (fit + transform on the same data).
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    transformed = pt.fit_transform(df_zero.values)
    df_yj = pd.DataFrame(data=transformed,
                         columns=df_zero.columns,
                         index=df_zero.index)
    # Z-score normalization of the transformed values.
    df_zscore = pd.DataFrame(data=zscore(df_yj),
                             columns=df_zero.columns,
                             index=df_zero.index)
    df_zscore['flag'] = flags
    return df_zscore.iloc[:, 5:]
def dataloader(self):
    """Split the instance's train/val/test frames into feature/target
    pairs; when ``self.transform`` is set, Box-Cox transform the target
    (fit on train) and persist the fitted scaler for later inversion.

    Returns (X_train, y_train, X_val, y_val, X_test, y_test).
    """
    # Column(s) excluded from the feature matrices; 'actual_load' is the target.
    cols_drop = [
        "actual_load",
    ]
    X_train = self.train.drop(columns=cols_drop)
    y_train = self.train.actual_load
    X_test = self.test.drop(columns=cols_drop)
    y_test = self.test.actual_load
    X_val = self.val.drop(columns=cols_drop)
    y_val = self.val.actual_load
    if self.transform is not None:
        # Box-Cox requires strictly positive targets — assumes
        # actual_load > 0 everywhere; TODO confirm.
        scaler = PowerTransformer(method="box-cox")
        y_train = scaler.fit_transform(
            np.array(self.train.actual_load).reshape(-1, 1))
        y_train = y_train.ravel()
        y_val = scaler.transform(
            np.array(self.val.actual_load).reshape(-1, 1))
        y_val = y_val.ravel()
        # Saving sklearn transformation file to be further used for inverse transformation in test.py
        scaler_filename = SCALER_FILENAME
        joblib.dump(scaler, scaler_filename)
    # NOTE(review): y_test is returned untransformed even when a transform
    # is configured — confirm test.py applies the saved scaler itself.
    return X_train, y_train, X_val, y_val, X_test, y_test
class BoxCox(Primitive):
    """ Power Transform primitive.
        The class applies BoxCox power transformation
        to make the selected features have normal distribution.

    # Arguments
        transformer: PowerTransformer. Instance of scikit-learn
            PowerTransformer object
    """
    transformer = None
    supported_ops = ('add', 'upd')

    def _fit(self, data, y=None):
        # Fit a fresh transformer on the currently selected columns.
        self.transformer = PowerTransformer()
        self.transformer.fit(data.X[self.selected], y)
        return self

    def _transform(self, data, y=None):
        # Transform the selected columns and write them back as numeric.
        transformed_values = self.transformer.transform(data.X[self.selected])
        data.update(self.operation, self.selected, transformed_values,
                    new_type='NUM', key=self.name_key)
        return data
def augmentation(X, Y, noise = False, bootstrapping = True,
                 noiseSTD = [0.1/2, 0.1/2, 0.01/2, 0.0002/2,0.01/2,0.02/2],
                 nr_boot =1000, bootstrap_bl_size = 488, boot_freq = 100):
    """Augment (X, Y) by gaussian noise and/or moving-block bootstrapping.

    X is indexed over three axes (np.ndindex over X.shape), so it is
    assumed to be (samples, timesteps, channels) — TODO confirm. Each
    enabled augmentation doubles the sample count, stacking the augmented
    copies (and a copy of Y) below the originals.

    NOTE(review): noiseSTD is a mutable default argument; it is never
    mutated here, but a tuple would be safer.

    Returns the augmented (X, Y).
    """
    if noise:
        Xn = X.copy()
        # Add channel-specific gaussian noise to every element.
        for i, j, k in np.ndindex(X.shape):
            Xn[i, j, k] += np.random.normal(0, 1)*noiseSTD[k]
        X = np.vstack([X, Xn])
        Y = np.vstack([Y, Y])
    if bootstrapping:
        Xb = X.copy()
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        for i in range(Xb.shape[0]):
            # Gaussianize each sample before decomposing it.
            pt.fit(Xb[i])
            lambda_param = pt.lambdas_  # NOTE(review): unused — kept for parity.
            transformed = pt.transform(Xb[i])
            # seasonal_decompose's `freq` kwarg was renamed `period` in
            # newer statsmodels — assumes an older version; TODO confirm.
            result = seasonal_decompose(transformed, model='additive',
                                        freq=boot_freq)
            # Moving Block Bootstrap on Residuals
            bootstrapRes = MBB(bootstrap_bl_size, result.resid)
            for data in bootstrapRes.bootstrap(nr_boot):
                bs_x = data[0][0]
                reconSeriesYC = result.trend + result.seasonal + bs_x
                # NOTE(review): Xb[i] is overwritten on every bootstrap
                # iteration, so only the last replicate survives — confirm
                # this is intended (nr_boot replicates are generated).
                Xb[i] = pt.inverse_transform(reconSeriesYC)
        # Fall back to the original value wherever reconstruction produced
        # NaN (decomposition leaves NaN trend/resid at the series edges).
        for i,j,k in np.ndindex(X.shape):
            if np.isnan(Xb[i,j,k]):
                Xb[i,j,k] = X[i,j,k]
        X = np.vstack([X, Xb])
        Y = np.vstack([Y, Y])
    return X, Y
def main():
    """Load the data, append a Yeo-Johnson-transformed copy of each column
    listed in ``col`` (named '<column>_yeojohn'), and publish the result.

    Raises
    ------
    ValueError
        If the loader returns an empty frame.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for c in col:
        # Bug fix: the original's else-branch was the bare name `Pass`
        # (capital P) — a NameError whenever a column was missing.
        # Missing columns are now simply skipped.
        if c not in df:
            continue
        df[c] = df[c].astype('int64')
        features = df[[c]]
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        # fit() returns the transformer itself; the original overwrote the
        # fit result with the transform result immediately. fit_transform
        # expresses the intent directly.
        transformed = pt.fit_transform(features)
        df_xt = pd.DataFrame(data=transformed, columns=[c + '_yeojohn'])
        # NOTE(review): df_xt has a fresh RangeIndex; join aligns on index,
        # which assumes df also carries a default RangeIndex — confirm.
        df = df.join(df_xt)
    return _helper.publish(df)
def df_power_transformer(df):
    """Power-transform every column of ``df`` (fit and transform on the
    same frame) and return the result as a DataFrame.

    Fixes vs. the original: the progress message said "QuantileScaled"
    although a PowerTransformer is used; the original index was silently
    replaced by a RangeIndex (now preserved); and a bare ``df.head()``
    whose result was discarded has been removed.
    """
    from sklearn.preprocessing import PowerTransformer
    power_transform_scaler = PowerTransformer().fit(df)
    df = pd.DataFrame(power_transform_scaler.transform(df),
                      columns=df.columns, index=df.index)
    print("DataSet PowerTransformed...")
    return df
def transformer(inputs):
    """applies yeo-johnson power transform to first two indices of array
    (n_files, total_mb) using lambdas, mean and standard deviation
    calculated for each variable prior to model training.

    Returns: dict of normalized continuous values ('x_files', 'x_size')
    plus the remaining categorical inputs passed through unchanged.
    (The original docstring claimed a 2D-array return; the function
    returns this dict — the array version survives only as a commented
    line below.)
    """
    X = inputs
    n_files = X[0]
    total_mb = X[1]
    # apply power transformer normalization to continuous vars
    x = np.array([[n_files], [total_mb]]).reshape(1, -1)
    pt = PowerTransformer(standardize=False)
    # NOTE(review): assigning lambdas_ by hand (no fit) relies on sklearn
    # treating this attribute as the fitted state; this works with
    # standardize=False but is fragile across sklearn versions — confirm.
    pt.lambdas_ = np.array([-1.51, -0.12])
    xt = pt.transform(x)
    # normalization (zero mean, unit variance) using statistics captured
    # at training time for each variable
    f_mean, f_sigma = 0.5682815234265285, 0.04222565843608133
    s_mean, s_sigma = 1.6250374589283951, 1.0396138451086632
    x_files = np.round(((xt[0, 0] - f_mean) / f_sigma), 5)
    x_size = np.round(((xt[0, 1] - s_mean) / s_sigma), 5)
    # print(f"Power Transformed variables: {x_files}, {x_size}")
    X_values = {
        "x_files": x_files,
        "x_size": x_size,
        "drizcorr": X[2],
        "pctecorr": X[3],
        "crsplit": X[4],
        "subarray": X[5],
        "detector": X[6],
        "dtype": X[7],
        "instr": X[8],
    }
    # X = np.array([x_files, x_size, X[2], X[3], X[4], X[5], X[6], X[7], X[8]])
    return X_values
def _log1p_minmax(series):
    # log1p-compress a right-skewed series, then rescale it to [0, 1].
    compressed = series.apply(lambda x: np.log1p(x)).to_frame()
    scaler = MinMaxScaler()
    scaler.fit(compressed)
    return scaler.transform(compressed)


def process_smiles_features(chemical_features):
    """Normalize the numeric SMILES-derived features of a chemical table.

    ``bonds_number``, ``atom_number`` and ``Mol`` (molecular weight) are
    log1p-compressed and min-max scaled to [0, 1]; ``WaterSolubility``
    gets a Box-Cox power transform (requires strictly positive values).
    The original repeated the same log1p + MinMax block three times; it
    is now factored into ``_log1p_minmax``. Works on a copy — the input
    frame is left untouched.

    Returns the processed copy.
    """
    db = chemical_features.copy()
    # Heavily right-skewed counts/weights: log1p then per-column min-max.
    for column in ("bonds_number", "atom_number", "Mol"):
        db[[column]] = _log1p_minmax(db[column])
    # Water Solubility: Box-Cox toward a gaussian shape.
    pt = PowerTransformer(method='box-cox')
    values = db.WaterSolubility.values.reshape(-1, 1)
    pt.fit(values)
    db[['WaterSolubility']] = pt.transform(values).ravel()
    return db
class PreProcess(BaseEstimator):
    """sklearn-compatible preprocessing step that instantiates the scaler
    named by ``classifier_type`` when ``fit`` is called.

    Fixes vs. the original: the error-message typo 'Unkown' is corrected,
    and the if/elif chain is replaced with a dispatch table.
    """

    def __init__(self, classifier_type: str = 'MinMaxScaler'):
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        """Instantiate the requested transformer and fit it on X.

        Raises
        ------
        ValueError
            For an unrecognized ``classifier_type``.
        """
        factories = {
            'StandardScaler': StandardScaler,
            'MinMaxScaler': MinMaxScaler,
            'MaxAbsScaler': MaxAbsScaler,
            'RobustScaler': RobustScaler,
            'QuantileTransformerUniform':
                lambda: QuantileTransformer(output_distribution="uniform"),
            'QuantileTransformerNormal':
                lambda: QuantileTransformer(output_distribution="normal"),
            'PowerTransformer':
                lambda: PowerTransformer(method="yeo-johnson"),
        }
        try:
            self.classifier_ = factories[self.classifier_type]()
        except KeyError:
            raise ValueError('Unknown classifier type.') from None
        self.classifier_.fit(X)
        return self

    def transform(self, X, y=None):
        """Apply the transformer fitted in ``fit``."""
        return self.classifier_.transform(X)
class PowerTransformerPrim(primitive):
    """Pipeline primitive wrapping sklearn's PowerTransformer with its
    default settings (yeo-johnson, standardized output)."""

    def __init__(self, random_state=0):
        # random_state is accepted for interface parity; it is not used here.
        super(PowerTransformerPrim, self).__init__(name='PowerTransformer')
        # Framework bookkeeping: numeric id, no tunable hyperparameters.
        self.id = 12
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = "Apply a power transform featurewise to make data more Gaussian-like. Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Currently, PowerTransformer supports the Box-Cox transform and the Yeo-Johnson transform. The optimal parameter for stabilizing variance and minimizing skewness is estimated through maximum likelihood. Box-Cox requires input data to be strictly positive, while Yeo-Johnson supports both positive or negative data. By default, zero-mean, unit-variance normalization is applied to the transformed data."
        self.hyperparams_run = {'default': True}
        self.scaler = PowerTransformer()
        # Accepted-input type code consumed by can_accept_c; its semantics
        # are defined by the framework, not visible in this block.
        self.accept_type = 'c_t'

    def can_accept(self, data):
        # Delegates the acceptance check to the framework helper.
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        # Always considered applicable in this implementation.
        return True

    def fit(self, data):
        # Fit the transformer on the feature matrix stored under key 'X'.
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        # Transform and rename every column with a '_pwrtrnsfrm' suffix.
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_pwrtrnsfrm".format(x) for x in cols]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']), columns=cols)
        final_output = {0: output}
        return final_output
def gaussian_trans_(self, random_state, X_samp, listX, distri='normal',
                    noise=0):
    """Fit a Yeo-Johnson PowerTransformer on ``X_samp`` and apply it to
    every non-None array in ``listX`` (entries are replaced in place).

    Fix vs. the original: a dead ``if False:`` branch (an abandoned
    Gaussianize-based code path) was removed — behavior is unchanged.
    ``random_state``, ``distri`` and ``noise`` are accepted for interface
    compatibility but are not used by the live path.

    Returns (listX, power) where ``power`` is the fitted transformer.
    """
    power = PowerTransformer(method='yeo-johnson').fit(X_samp)
    for i, X_ in enumerate(listX):
        if X_ is None:
            continue
        listX[i] = power.transform(X_)
    return listX, power
def update_power_transform(df):
    """Power-transform 'n_files' and 'total_mb' (no standardization),
    z-score the transformed columns, write them back to ``df`` as
    'x_files'/'x_size', and return (df, pt_transform) where pt_transform
    records the fitted lambdas and per-column mean/sigma.
    """
    pt = PowerTransformer(standardize=False)
    continuous = df[["n_files", "total_mb"]]
    pt.fit(continuous)
    matrix = pt.transform(continuous)
    files, size = matrix[:, 0], matrix[:, 1]
    # Manual z-score so the statistics can be exported with the lambdas.
    f_mean, f_sigma = np.mean(files), np.std(files)
    s_mean, s_sigma = np.mean(size), np.std(size)
    x_files = (files - f_mean) / f_sigma
    x_size = (size - s_mean) / s_sigma
    normalized = np.stack([x_files, x_size], axis=1)
    df_norm = pd.DataFrame(normalized, index=continuous.index,
                           columns=["x_files", "x_size"])
    df["x_files"] = df_norm["x_files"]
    df["x_size"] = df_norm["x_size"]
    f_lambda, s_lambda = pt.lambdas_
    pt_transform = {
        "f_lambda": f_lambda,
        "s_lambda": s_lambda,
        "f_mean": f_mean,
        "f_sigma": f_sigma,
        "s_mean": s_mean,
        "s_sigma": s_sigma,
    }
    print(pt_transform)
    return df, pt_transform
def fit(self, dataframe: DataFrame) -> None:
    """Estimate and save the optimal PowerTransformer parameters for each
    feature. Also store the values required to scale and unscale the
    features if scaling needs to be applied.

    :param dataframe: dataframe containing only the features that need to
        be normalized
    """
    for feature in list(dataframe):
        self._registered_features.append(feature)
        data = dataframe[feature].to_numpy().reshape(-1, 1)  # load feature into a numpy array
        self._transformers[feature] = {}  # initialized storage for the feature transformers
        if self.to_log(data):  # log-transform features flagged by to_log()
            data = np.log(data)
            self._log_features.append(feature)
        # Fit the per-feature power transform. (The original comment here
        # wrongly repeated the log-transform note.)
        power_transformer = PowerTransformer()
        power_transformer.fit(data)
        self._transformers[feature]['normalizer'] = power_transformer
        if self._scale:
            # Min-max scale the normalized values into the configured range.
            scaler = MinMaxScaler(feature_range=self._scale)
            scaler.fit(power_transformer.transform(data))
            self._transformers[feature]['scaler'] = scaler
def gaussian_scaler(train, validate, test):
    '''
    Scale the charge/tenure columns with a gaussian (power) scaler.

    Uses the Yeo-Johnson method (handles negative and positive data;
    Box-Cox would require strictly positive data) to make the columns
    resemble a normal distribution. ``standardize=False``, so no
    zero-mean / unit-variance rescaling is applied.

    NOTE(review): the input dataframes are modified in place — the scaled
    columns are written back into the callers' frames — and then returned.

    Returns (scaler, train, validate, test).
    '''
    scaler = PowerTransformer(method='yeo-johnson', standardize=False, copy=True)
    # Fit on train only; validate/test reuse train's fitted lambdas.
    train[['monthly_charges', 'tenure', 'total_charges']] = scaler.fit_transform(
        train[['monthly_charges', 'tenure', 'total_charges']])
    validate[['monthly_charges', 'tenure', 'total_charges']] = scaler.transform(
        validate[['monthly_charges', 'tenure', 'total_charges']])
    test[['monthly_charges', 'tenure', 'total_charges']] = scaler.transform(
        test[['monthly_charges', 'tenure', 'total_charges']])
    return scaler, train, validate, test
def yeo_johnson_transf(data):
    """Yeo-Johnson transform (standardized output) every column of ``data``.

    Fix vs. the original: the returned frame now keeps ``data``'s index
    instead of being silently rebuilt with a fresh RangeIndex.

    Returns (df_yeojohnson, lambdas) — the transformed frame and the
    fitted per-column lambdas.
    """
    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    pt.fit(data)
    lambdas = pt.lambdas_
    df_yeojohnson = pd.DataFrame(pt.transform(data),
                                 columns=data.columns.values,
                                 index=data.index)
    return df_yeojohnson, lambdas