def pd_fixskew(Data, tresh=0.5, mthd='box-cox', exclude=(), return_lambda=False):
    """Power-transform every non-excluded column of a DataFrame.

    Each column is fitted on its own with
    ``PowerTransformer(method=mthd, standardize=True)``. When that fit raises
    (for instance Box-Cox on zero or negative values) a warning is printed and
    the column is transformed with Yeo-Johnson instead. No epsilon shift is
    applied.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame to transform. Every column, excluded ones included, is written
        into a float array, so all columns must be convertible to float.
    tresh : float
        Not used by the implementation; kept so existing calls keep working.
    mthd : str
        Method passed to ``PowerTransformer`` for the first attempt.
    exclude : iterable
        Column labels copied through untransformed.
    return_lambda : bool
        When true, also return the list of fitted transformers.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, list)
        The transformed frame (same index and columns as ``Data``) and,
        optionally, the fitted transformers in column order. Excluded columns
        contribute no entry to that list.
    """
    f_cols = np.empty(shape=Data.shape)
    transformer = []
    for i, col in enumerate(Data.columns):
        if col in exclude:
            f_cols[:, i] = Data[col]
            continue
        array_col = Data[col].values.reshape(-1, 1)
        try:
            trnsfm = PowerTransformer(method=mthd, standardize=True)
            f_col = trnsfm.fit_transform(array_col)
        except Exception:
            # Best-effort fallback: any failure of the requested method is
            # retried with Yeo-Johnson, which accepts non-positive values.
            print('WARNING : {} failed on {} passing to yeo-johnson'.format(mthd, col))
            trnsfm = PowerTransformer(method='yeo-johnson', standardize=True)
            f_col = trnsfm.fit_transform(array_col)
        f_cols[:, i] = f_col.ravel()
        transformer.append(trnsfm)
    Data_skewFixed = pd.DataFrame(f_cols, index=Data.index, columns=Data.columns)
    if return_lambda:
        return Data_skewFixed, transformer
    return Data_skewFixed
def class_model(df):
    """Train and report a logistic-regression classifier for ``bad_loan``.

    The frame is split 70/30, the features are power-transformed, and a
    ``LogisticRegression`` is evaluated twice: once on the hold-out split and
    once with 10-fold cross validation on the whole frame. All results are
    printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``bad_loan`` column used as the target; every other
        column is used as a feature.
    """
    # Split features/target into train and test partitions.
    x = df.drop('bad_loan', axis=1)
    y = df.bad_loan
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Feature scaling: fit on the training partition only and reuse the same
    # fitted transform on the test partition (refitting on test data would
    # scale the two partitions inconsistently).
    PT = PowerTransformer()
    x_train = PT.fit_transform(x_train)
    x_test = PT.transform(x_test)

    # Logistic regression evaluated on the hold-out partition.
    log = LogisticRegression()
    log.fit(x_train, y_train)
    log_pred = log.predict(x_test)
    log_accuracy = metrics.accuracy_score(y_test, log_pred)
    print("Accuracy: ", log_accuracy)
    log_precision = metrics.precision_score(y_test, log_pred, pos_label=0)
    print("Precision: ", log_precision)
    log_recall = metrics.recall_score(y_test, log_pred, pos_label=0)
    print("Recall: ", log_recall)
    log_f1_score = metrics.f1_score(y_test, log_pred, pos_label=0)
    print("F1 Score: ", log_f1_score)
    print("Confusion Matrix:\n", confusion_matrix(y_test, log_pred))
    print("Classification Report:\n", classification_report(y_test, log_pred))

    # Logistic regression evaluated with 10-fold cross validation.
    # NOTE(review): the transformer is fitted on all rows before the folds are
    # made, so each fold's validation rows influence the scaling.
    PT1 = PowerTransformer()
    x = PT1.fit_transform(x)
    print('Classification Results with cross validation::')
    for label, scoring in (
        ("Accuracy: ", 'accuracy'),
        ("Precision: ", 'precision_macro'),
        ("Recall: ", 'recall_macro'),
        ("F1 Score: ", 'f1_macro'),
    ):
        fold_scores = cross_val_score(log, x, y, cv=10, scoring=scoring)
        print(label, fold_scores.mean())
def normalize_by_category(data):
    """Rescale ``success_score`` separately inside each identifier group.

    ``split_data_by_identifier`` partitions ``data`` into a dict of frames.
    For each frame, rows whose ``success_score`` z-score has magnitude 3 or
    more are dropped, the remaining scores are standardised (population
    standard deviation) and then passed through ``map_transform(..., 0, 10)``.
    The processed groups are concatenated and returned.

    Parameters
    ----------
    data : object accepted by ``split_data_by_identifier``
        Each resulting group must expose ``success_score`` and
        ``income_bracket`` columns.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the processed groups.
    """
    ID_dict = split_data_by_identifier(data)
    for key, item in ID_dict.items():
        z_score = (item.success_score - item.success_score.mean()) / item.success_score.std()
        # Explicit copy: the columns below are assigned on the filtered frame,
        # which must not be an ambiguous slice of the group.
        item = item[z_score.abs() < 3].copy()

        # NOTE(review): the transformed values are discarded, so this call only
        # validates that the two columns can be fitted. Confirm whether the
        # output was meant to replace the columns.
        pt = PowerTransformer()
        pt.fit_transform(item[['success_score', 'income_bracket']])

        item['success_score'] = (
            item.success_score - item.success_score.mean()) / item.success_score.std(ddof=0)
        item['success_score'] = map_transform(item['success_score'], 0, 10)
        ID_dict[key] = item
    chart = pd.concat(ID_dict.values())
    return chart
class Target_Transformation(BaseEstimator, TransformerMixin):
    """Power-transform the target column during ``fit_transform`` only.

    Parameters
    ----------
    target : str
        Name of the target column.
    function_to_apply : str
        ``'bc'`` (or the canonical ``'box-cox'``) selects Box-Cox; any other
        value selects Yeo-Johnson. The resolved method name is stored on
        ``self.function_to_apply``.
    """

    def __init__(self, target, function_to_apply='bc'):
        self.target = target
        # The stored value is the resolved method name, so the constructor has
        # to accept that name as well; otherwise re-creating the object from
        # its own parameters would turn 'box-cox' into 'yeo-johnson'.
        if function_to_apply in ('bc', 'box-cox'):
            self.function_to_apply = 'box-cox'
        else:
            self.function_to_apply = 'yeo-johnson'

    def fit(self, dataset, y=None):
        """Do nothing and return the estimator itself."""
        return self

    def transform(self, dataset, y=None):
        """Return ``dataset`` unchanged."""
        return dataset

    def fit_transform(self, dataset, y=None):
        """Return a copy of ``dataset`` with the target column transformed.

        The fitted ``PowerTransformer`` is kept on ``self.p_transform_target``.
        If the target holds any zero or negative value the method is switched
        to Yeo-Johnson, and that switch is stored on the instance.
        """
        data = dataset.copy()
        # Box-Cox needs strictly positive input, so fall back automatically.
        if (data[self.target] <= 0).any():
            self.function_to_apply = 'yeo-johnson'
        self.p_transform_target = PowerTransformer(
            method=self.function_to_apply)
        data[self.target] = self.p_transform_target.fit_transform(
            np.array(data[self.target]).reshape(-1, 1))
        return data
def to_gaussian(data, submission_feat, exclude, gauss):
    """Optionally apply a Yeo-Johnson transform to the numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    submission_feat : iterable of str
        Names whose upper-cased form is excluded from the transform.
    exclude : iterable
        Column labels excluded from the transform.
    gauss : str
        ``"no"`` disables the transform; any other value enables it.

    Returns
    -------
    tuple
        ``(frame, transformer, features)``. With the transform enabled,
        ``frame`` holds the transformed numeric columns followed by the
        untouched ones, ``transformer`` is the fitted ``PowerTransformer`` and
        ``features`` lists the transformed columns in their original frame
        order. With ``gauss == "no"``, ``frame`` is a plain copy and both other
        items are the string ``"No PowerTransformer applied"``.
    """
    if gauss != "no":
        from sklearn.preprocessing import PowerTransformer

        df = data.copy()
        numeric_cols = df.select_dtypes(include=[
            "uint8", "int16", "int32", "int64", "float16", "float32", "float64"
        ]).columns
        skipped = set(exclude) | {feat.upper() for feat in submission_feat}
        # Keep the frame's own column order so the result is reproducible
        # from run to run (set arithmetic gives an arbitrary order).
        features = [col for col in numeric_cols if col not in skipped]
        transformed = set(features)
        no_action = [col for col in df.columns if col not in transformed]

        pt = PowerTransformer(method='yeo-johnson')
        values = pt.fit_transform(df[features])
        df_gaussian = pd.DataFrame(data=values,
                                   columns=features,
                                   index=df.index)
        if no_action:
            df_gaussian[no_action] = df[no_action]
    else:
        df_gaussian = data.copy()
        pt = "No PowerTransformer applied"
        features = pt
    gc.collect()
    return df_gaussian, pt, features
def preprocessing(n_clicks, preprocessings):
    """Apply the selected preprocessing steps and report what was done.

    Reads the module-level ``use_df``, ``target_column``, ``num_cols`` and
    ``cat_cols`` and rebinds the module-level ``processed_df``.

    Parameters
    ----------
    n_clicks : int
        Click counter; ``0`` yields an empty heading and no processing.
    preprocessings : list or None
        Selected step codes. ``'FN'`` fills missing values with column means,
        ``'YJ'`` applies a Yeo-Johnson transform to ``num_cols``, ``'SS'``
        standardises ``num_cols`` and ``'OE'`` one-hot encodes ``cat_cols``.
        Codes are detected by substring search in ``str(preprocessings)``.

    Returns
    -------
    html.H5
        A heading describing the outcome.
    """
    global use_df
    global processed_df
    global cat_cols
    global num_cols

    if n_clicks == 0:
        return html.H5('')
    elif target_column is None:
        return html.H5('先に目的変数を指定してください。')
    elif preprocessings is None or preprocessings == []:
        processed_df = use_df.copy()
        return html.H5('前処理は行われていません')

    selected = str(preprocessings)
    text = []
    processed_df = use_df.copy()
    if 'FN' in selected:
        # numeric_only: taking the mean of non-numeric columns raises on
        # recent pandas; only numeric columns can be mean-filled anyway.
        processed_df = processed_df.fillna(processed_df.mean(numeric_only=True))
        text.append('欠損値補完')
    if 'YJ' in selected:
        yj = PowerTransformer(method='yeo-johnson')
        processed_df[num_cols] = yj.fit_transform(processed_df[num_cols])
        text.append('Yeo-Johnson変換')
    if 'SS' in selected:
        ss = StandardScaler()
        processed_df[num_cols] = ss.fit_transform(processed_df[num_cols])
        text.append('標準化')
    if 'OE' in selected:
        oe = ce.OneHotEncoder(cols=cat_cols, handle_unknown='impute')
        processed_df = oe.fit_transform(processed_df)
        text.append('One-Hot Encoding')
    return html.H5('{}を行いました。'.format(text))
def plot_obs_umaps(h5ad, obs_cols, clip=True, normalize=True):
    """Draw one UMAP figure per observation column.

    TODO: revise the normalization! -> use vmin and vmax from scpy

    The selected ``h5ad.obs`` columns are modified in place before plotting.

    :param h5ad: object exposing an ``obs`` DataFrame, accepted by
        ``sc.pl.umap``.
    :param obs_cols: list of column names in ``h5ad.obs`` to plot.
    :param clip: when true, limit every column to its mean +/- 3 population
        standard deviations.
    :param normalize: when true, apply a standardising Yeo-Johnson transform
        to the columns.
    :return: list with the current matplotlib figure captured after each
        ``sc.pl.umap`` call, each resized to 5 x 4 inches.
    """
    if clip:
        values = h5ad.obs[obs_cols]
        center = values.mean()
        spread = values.std(ddof=0)
        # Clamp outliers to the 3-sigma bounds. Masking followed by a
        # non-assigned fillna left NaNs behind instead of clipped values.
        h5ad.obs[obs_cols] = values.clip(lower=center - 3 * spread,
                                         upper=center + 3 * spread,
                                         axis=1)
    if normalize:
        pete = PowerTransformer(method='yeo-johnson', standardize=True)
        h5ad.obs[obs_cols] = pete.fit_transform(h5ad.obs[obs_cols].values)

    figures = []
    for c in obs_cols:
        sc.pl.umap(h5ad, color=c, return_fig=True, cmap='RdYlBu_r')
        fig = plt.gcf()
        fig.set_size_inches(5, 4)
        figures.append(fig)
    return figures
def dataloader(self):
    """Split the train/val/test frames into features and ``actual_load``.

    When ``self.transform`` is not ``None`` the training and validation
    targets are Box-Cox transformed (fitted on the training target) and
    flattened to 1-D arrays, and the fitted transformer is dumped with joblib
    to ``SCALER_FILENAME`` for later inverse transformation. The test target
    is returned untransformed in either case.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    target_cols = ["actual_load"]

    X_train, y_train = self.train.drop(columns=target_cols), self.train.actual_load
    X_test, y_test = self.test.drop(columns=target_cols), self.test.actual_load
    X_val, y_val = self.val.drop(columns=target_cols), self.val.actual_load

    if self.transform is None:
        return X_train, y_train, X_val, y_val, X_test, y_test

    scaler = PowerTransformer(method="box-cox")
    train_target = np.array(self.train.actual_load).reshape(-1, 1)
    y_train = scaler.fit_transform(train_target).ravel()
    val_target = np.array(self.val.actual_load).reshape(-1, 1)
    y_val = scaler.transform(val_target).ravel()

    # Persist the fitted transformer; it is needed to invert predictions.
    joblib.dump(scaler, SCALER_FILENAME)
    return X_train, y_train, X_val, y_val, X_test, y_test
def data_preprocessing(dataset, model):
    """Extract labels and features, scale them, and filter LOF outliers.

    Steps:
      1) The second column is taken as the label; the first two columns are
         dropped from the features.
      2) Features are scaled with ``PowerTransformer`` when ``model == NB``
         and with ``StandardScaler`` otherwise.
      3) A 20-neighbour ``LocalOutlierFactor`` is fitted and the rows whose
         negative outlier factor is at or below the 0.5% quantile are removed.

    Returns
    -------
    tuple
        ``(X, y, X_selected, y_selected)``: the scaled features and labels,
        followed by the same arrays with the outlier rows removed.
    """
    labels = dataset.iloc[:, 1].to_numpy()
    # Columns 0 and 1 are dropped together.
    # NOTE(review): the original comment called column 0 'INSTANCE_ID'.
    features = dataset.drop(dataset.columns[[0, 1]], axis=1)

    if model == NB:
        scaler = PowerTransformer()
    else:
        scaler = StandardScaler()
    scaled = scaler.fit_transform(features)

    # Fit the detector; only the per-sample scores are needed afterwards.
    detector = LocalOutlierFactor(n_neighbors=20)
    detector.fit_predict(scaled)
    scores = detector.negative_outlier_factor_
    inliers = scores > np.quantile(scores, 0.005)

    return scaled, labels, scaled[inliers], labels[inliers]
def map_2_gaussian(X, mapping_method):
    """Power-transform each row of ``X`` and stack the results.

    A single ``PowerTransformer(method=mapping_method, standardize=False)`` is
    refitted on every row after reshaping it to ``(1, M)``. Because
    ``standardize=False``, no zero-mean/unit-variance normalisation is applied
    to the output.

    ``mapping_method`` is either ``'box-cox'`` or ``'yeo-johnson'``.

    NOTE(review): reshaping a row to ``(1, M)`` presents it to scikit-learn as
    one sample with M features, so each fit sees a single value per feature.
    Confirm whether ``reshape(-1, 1)`` (M samples of one feature) was intended.

    Returns a ``float32`` array built from the per-row results.
    """
    pt = PowerTransformer(method=mapping_method, standardize=False)
    mapped_rows = []
    for row_idx in range(X.shape[0]):
        as_single_sample = X[row_idx].reshape(1, -1)
        mapped_rows.append(pt.fit_transform(as_single_sample))
    stacked = np.array(mapped_rows)
    return stacked.astype('float32')
def transform(df):
    """Fit a default ``PowerTransformer`` on ``df`` and return the result.

    :param df: a pandas DataFrame with the values to transform.
    :return: whatever ``PowerTransformer.fit_transform`` returns for ``df``;
        the result is not wrapped back into a DataFrame here.
    """
    return PowerTransformer().fit_transform(df)
def normalize_by_category(data, identifier_1, identifier_2, success_category, numeric_correlative):
    """Build per-group normalised ``true_scores`` from a success column.

    ``split_data_by_identifier`` partitions ``data`` into a dict of frames.
    For each frame, rows whose ``success_category`` z-score has magnitude 3 or
    more are dropped. The surviving raw values are stored in a
    ``success_category`` column, while ``true_scores`` holds the same values
    standardised (population standard deviation) and passed through
    ``map_transform(..., 1, 10)``. The source column named by the
    ``success_category`` argument is removed.

    Parameters
    ----------
    data, identifier_1, identifier_2
        Forwarded unchanged to ``split_data_by_identifier``.
    success_category : str
        Name of the column holding the raw success values. It may itself be
        ``'success_category'``.
    numeric_correlative : str
        Name of a second column handed to the PowerTransformer check below.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the processed groups; ``true_scores`` and
        ``success_category`` are the last two columns.
    """
    ID_dict = split_data_by_identifier(data, identifier_1, identifier_2)
    for key, item in ID_dict.items():
        scores = item[success_category]
        keep = ((scores - scores.mean()) / scores.std()).abs() < 3
        raw = scores[keep]

        # Work on a fresh frame so the caller's group is never mutated, and
        # drop both spellings so the function also works when the source
        # column is literally named 'success_category'.
        item = item[keep].drop(columns=[success_category, 'success_category'],
                               errors='ignore')
        item['true_scores'] = raw

        # NOTE(review): the transformed values are discarded, so this call only
        # validates that the two columns can be fitted. Confirm whether the
        # output was meant to be used.
        pt = PowerTransformer()
        pt.fit_transform(item[['true_scores', numeric_correlative]])

        item['true_scores'] = (raw - raw.mean()) / raw.std(ddof=0)
        item['success_category'] = raw
        item['true_scores'] = map_transform(item['true_scores'], 1, 10)
        ID_dict[key] = item
    chart = pd.concat(ID_dict.values())
    return chart
def predXgbYJ():
    """Fit an XGBoost regressor on the module-level training data and predict.

    Uses the module-level ``X_t``, ``y_t``, ``test`` and ``pt2``. The test
    features are power-transformed with a transformer fitted on ``test``
    itself, and predictions are mapped back through ``pt2.inverse_transform``.

    NOTE(review): fitting a fresh transformer on the test features may scale
    them differently from the training features; confirm this is intended.

    Returns
    -------
    tuple
        ``(predictions, r2)`` where ``r2`` is the score on the training data.
    """
    model = xgb.XGBRegressor()
    model.fit(X_t, y_t)
    train_r2 = model.score(X_t, y_t)

    test_features = PowerTransformer().fit_transform(test)
    scaled_pred = model.predict(test_features).reshape(-1, 1)
    return pt2.inverse_transform(scaled_pred), train_r2
def predRFYJ():
    """Fit a random-forest regressor on the module-level data and predict.

    Uses the module-level ``X_t``, ``y_t``, ``test`` and ``pt2``. The forest
    has 1400 trees and a fixed ``random_state`` of 42. The test features are
    power-transformed with a transformer fitted on ``test`` itself, and
    predictions are mapped back through ``pt2.inverse_transform``.

    NOTE(review): fitting a fresh transformer on the test features may scale
    them differently from the training features; confirm this is intended.

    Returns
    -------
    tuple
        ``(predictions, r2)`` where ``r2`` is the score on the training data.
    """
    forest = RandomForestRegressor(n_estimators=1400, random_state=42)
    forest.fit(X_t, y_t)
    train_r2 = forest.score(X_t, y_t)

    test_features = PowerTransformer().fit_transform(test)
    scaled_pred = forest.predict(test_features).reshape(-1, 1)
    return pt2.inverse_transform(scaled_pred), train_r2
def predLinearYJ():
    """Fit a linear regression on the module-level training data and predict.

    Uses the module-level ``X_t``, ``y_t``, ``test`` and ``pt2``. The test
    features are power-transformed with a transformer fitted on ``test``
    itself, and predictions are mapped back through ``pt2.inverse_transform``.

    NOTE(review): fitting a fresh transformer on the test features may scale
    them differently from the training features; confirm this is intended.

    Returns
    -------
    tuple
        ``(predictions, r2)`` where ``r2`` is the score on the training data.
    """
    regressor = linear_model.LinearRegression()
    regressor.fit(X_t, y_t)
    train_r2 = regressor.score(X_t, y_t)

    test_features = PowerTransformer().fit_transform(test)
    scaled_pred = regressor.predict(test_features).reshape(-1, 1)
    return pt2.inverse_transform(scaled_pred), train_r2
def data_transformation(self, data):
    """Standardise ``data`` and then power-transform the standardised values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns and index are reused for the outputs.

    Returns
    -------
    tuple
        ``(scaler, transformer, transformed_data)``: the fitted
        ``StandardScaler``, the fitted ``PowerTransformer`` and the final
        frame.
    """
    def _as_frame(values):
        # Re-attach the labels of the input frame to a raw array.
        return pd.DataFrame(values, columns=data.columns, index=data.index)

    scaler = StandardScaler()
    standard_data = _as_frame(scaler.fit_transform(data))

    transformer = PowerTransformer()
    transformed_data = _as_frame(transformer.fit_transform(standard_data))
    return scaler, transformer, transformed_data
def apply_yeojohnson(df):
    """Apply a standardising Yeo-Johnson transform to every column.

    ``df`` is wrapped in a DataFrame, its column labels are printed, and the
    transformed values are returned as a new DataFrame carrying those column
    labels. The new frame is built without an explicit index, so it gets the
    default one rather than the input's.
    """
    frame = pd.DataFrame(df)
    labels = frame.columns
    print(labels)
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    return pd.DataFrame(transformer.fit_transform(frame), columns=labels)
def PowerScale(self, df, target):
    """Power-transform every column of ``df`` except ``target``.

    Returns
    -------
    tuple
        ``(frame, "PowerTransformer()")`` where ``frame`` holds the
        transformed feature columns (same index and labels as in ``df``) with
        the untouched ``target`` column added back.
    """
    features = df.drop(target, axis=1)
    scaled = PowerTransformer().fit_transform(features)
    result = pd.DataFrame(scaled, index=features.index, columns=features.columns)
    result[target] = df[target]
    return result, "PowerTransformer()"
def _get_normalized_input_data(device: str, start_date: date, end_date: date) -> pd.DataFrame:
    """Load the input data for ``device`` and power-transform it.

    The data comes from ``_get_input_data``. An empty frame is returned as is;
    otherwise a default ``PowerTransformer`` is fitted on it and the result is
    returned with the original columns and index.
    """
    raw = _get_input_data(device, start_date, end_date)
    if raw.empty:
        return raw
    values = PowerTransformer().fit_transform(raw)
    return pd.DataFrame(values, columns=raw.columns, index=raw.index)
def power_transform(df):
    """Power-transform the int/float columns of ``df`` in place.

    ``TARGET`` and every column whose name starts with ``SK_ID`` are skipped.
    ``AMT_INCOME_TOTAL`` is transformed with Box-Cox, every other selected
    column with Yeo-Johnson, and each transformed column is stored as
    ``float32``. The same (mutated) frame is returned.
    """
    candidates = df.select_dtypes(include=['int', 'float']).columns
    to_transform = [
        name for name in candidates
        if name != 'TARGET' and not name.startswith('SK_ID')
    ]
    for name in to_transform:
        method = 'box-cox' if name == 'AMT_INCOME_TOTAL' else 'yeo-johnson'
        encoder = PowerTransformer(method=method)
        df[name] = encoder.fit_transform(df[[name]])
        df[name] = df[name].astype('float32')
    return df
def box_cox_transform(df, include_missing_value=False):
    """Box-Cox transform the strictly positive numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; the selected columns are overwritten in place.
    include_missing_value : bool
        When false, a column is selected only if every value is > 0, which
        rejects columns containing NaN. When true, a column is selected if it
        has at least one non-missing value and none of its values is <= 0, so
        NaN entries are tolerated.

    Returns
    -------
    pandas.DataFrame
        The same frame. It is returned unchanged when no column qualifies.
    """
    num_cols = utl.get_numerical_columns(df)
    if include_missing_value:
        # "no value <= 0" rather than "not all values <= 0": a single
        # non-positive entry makes Box-Cox fail.
        pos_cols = [
            c for c in num_cols
            if df[c].notna().any() and not (df[c] <= 0.0).any()
        ]
    else:
        pos_cols = [c for c in num_cols if (df[c] > 0.0).all()]

    # Fitting on an empty column selection raises, so leave the frame as is.
    if not pos_cols:
        return df

    pt = PowerTransformer(method='box-cox')
    df[pos_cols] = pt.fit_transform(df[pos_cols])
    return df
def transform(self, X):
    """Return ``X`` as a DataFrame with its numerical columns rescaled.

    ``X`` is wrapped in a DataFrame labelled with ``self.column_names``.
    Strategy ``"scaler"`` applies a ``MinMaxScaler`` and ``"transformer"`` a
    Yeo-Johnson ``PowerTransformer`` to ``self.numerical_cols``; any other
    strategy returns the frame unmodified.

    NOTE(review): the scaler/transformer is fitted on the data passed to this
    call, not on previously seen data; confirm this is intended.
    """
    frame = pd.DataFrame(X, columns=self.column_names)

    if self.strategy == "scaler":
        rescaler = MinMaxScaler()
    elif self.strategy == "transformer":
        rescaler = PowerTransformer(method='yeo-johnson')
    else:
        return frame

    frame[self.numerical_cols] = rescaler.fit_transform(
        frame[self.numerical_cols])
    return frame
def power_scaler(train, test):
    """Apply a feature-wise Yeo-Johnson transform to make data more Gaussian-like.

    The transformer is fitted on ``train`` and that same fitted transformer is
    applied to ``test``.

    Returns
    -------
    tuple
        ``(train, test)`` after transformation.
    """
    yeo_johnson = PowerTransformer(method='yeo-johnson')
    train_scaled = yeo_johnson.fit_transform(train)
    test_scaled = yeo_johnson.transform(test)
    return train_scaled, test_scaled
def model_main_classifier(C=0.05):
    """Train and report linear SVM classifiers for index ``399300.SZ``.

    Data comes from ``getdata`` (classifier mode, starting 2009-01-01). The
    features are Yeo-Johnson transformed with a transformer fitted on the
    training split. A ``LinearSVC`` and a linear-kernel ``SVC`` are fitted and
    their test scores printed. The confusion matrix, the counts of true
    values and the precision are then printed for the ``SVC`` predictions on
    the training split.

    Parameters
    ----------
    C : float
        Regularisation parameter passed to ``LinearSVC``.
    """
    ts_code = '399300.SZ'
    x_train, x_test, y_train, y_test = getdata(ts_code,
                                               type='classifier',
                                               startDate='20090101')

    transer = PowerTransformer(method='yeo-johnson')
    x_train = transer.fit_transform(x_train)
    x_test = transer.transform(x_test)

    # Linear support vector machine (LinearSVC).
    linear_svc = LinearSVC(C=C)
    linear_svc.fit(x_train, y_train)
    print('linearSVC score:', linear_svc.score(x_test, y_test))

    # SVC with a linear kernel.
    svc = SVC(kernel='linear', cache_size=1000)
    svc.fit(x_train, y_train)
    print('SVC score:', svc.score(x_test, y_test))

    # The diagnostics below are computed on the training split.
    predictions = svc.predict(x_train)
    con = confusion_matrix(y_train, predictions)
    print(con)
    print(f'真实值中为True的次数: {y_train.sum()}')
    print(f'预测值中为True的次数: {predictions.sum()}')
    print(f'精度(precision):{precision_score(y_train, predictions)}')
def transform_amplitude(inputfile, scale=True):
    """Read amplitudes from a binary file and map them towards a Gaussian.

    The file is read as 64-bit floats into a single column. That column is
    transformed three ways: Box-Cox, Yeo-Johnson, and a quantile transform to
    a normal distribution using one quantile per sample.

    Parameters
    ----------
    inputfile : str or file object
        Source handed to ``numpy.fromfile``.
    scale : bool
        When true, each transformed column is additionally min-max scaled.

    Returns
    -------
    tuple
        ``(amplitudes, bc_amplitudes, yj_amplitudes, qt_amplitudes)``, each of
        shape ``(n_samples, 1)``.
    """
    # np.float64 is what the removed ``np.float`` alias used to resolve to.
    amplitudes = np.fromfile(inputfile, dtype=np.float64)
    n_samples = amplitudes.shape[0]
    amplitudes = amplitudes.reshape((n_samples, -1))

    bc = PowerTransformer(method='box-cox')
    yj = PowerTransformer(method='yeo-johnson')
    qt = QuantileTransformer(n_quantiles=n_samples,
                             output_distribution='normal')

    bc_amplitudes = bc.fit_transform(amplitudes)
    yj_amplitudes = yj.fit_transform(amplitudes)
    qt_amplitudes = qt.fit_transform(amplitudes)

    if scale:
        # One scaler object is refitted for every array.
        min_max_scaler = MinMaxScaler()
        bc_amplitudes = min_max_scaler.fit_transform(bc_amplitudes)
        yj_amplitudes = min_max_scaler.fit_transform(yj_amplitudes)
        qt_amplitudes = min_max_scaler.fit_transform(qt_amplitudes)

    return amplitudes, bc_amplitudes, yj_amplitudes, qt_amplitudes
def transform4mancova(df, save_dir, file_name, ncolumns=None):
    """Yeo-Johnson transform selected columns in place and write a CSV.

    :param df: manovav_df - or any data frame that you want to transform to a
        normal distribution. The selected columns are overwritten.
    :param save_dir: prefix of the output path; it is concatenated as is, so
        include a trailing separator if one is needed.
    :param file_name: base name; the file is written to
        ``save_dir + 'normalized_' + file_name + '.csv'``.
    :param ncolumns: which columns to normalize. When ``None``, every column
        whose name contains ``NARS``, ``BFI`` or ``GODSPEED`` is used.
    :return: None
    """
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    # ``is None``: comparing an array-like selection with ``== None`` yields
    # an element-wise result that cannot be used as a condition.
    if ncolumns is None:
        ncolumns = df.columns[df.columns.str.contains('NARS')
                              | df.columns.str.contains('BFI')
                              | df.columns.str.contains('GODSPEED')]
    # Assign the raw array so values stay in row order; a freshly indexed
    # DataFrame would be re-aligned on df's index and could produce NaN.
    df[ncolumns] = pt.fit_transform(df[ncolumns])
    df.to_csv(save_dir + 'normalized_' + file_name + '.csv')
def normalize_features(dataset):
    """Power-transform the numeric features and plot each distribution.

    The columns come from ``num_features(dataset)``. After fitting a default
    ``PowerTransformer`` the fitted lambdas are printed. For every column a
    5 x 5 inch distribution plot is shown, followed by its skewness and
    kurtosis.

    Returns
    -------
    pandas.DataFrame
        The transformed columns. The frame is built without an explicit
        index, so it gets the default one.
    """
    numeric = num_features(dataset)
    pt = PowerTransformer()
    transformed = pd.DataFrame(data=pt.fit_transform(numeric),
                               columns=list(numeric))
    print('plot after transformation with lambda :', pt.lambdas_)
    for column in transformed.columns:
        series = transformed[column]
        plt.figure(figsize=(5, 5))
        sns.distplot(series)
        plt.show()
        print('Skewness :', series.skew())
        print('kurtosis :', series.kurtosis())
    return transformed
def transform_X_Power(X_train, column):
    """Fit a standardising Yeo-Johnson transform on ``X_train`` and apply it.

    Parameters
    ----------
    X_train : array-like
        Training data to fit and transform.
    column : sequence
        Column labels for the returned frame.

    Returns
    -------
    pandas.DataFrame
        The transformed training data labelled with ``column``. The fitted
        transformer is not returned.
    """
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    transformed = yeo_johnson.fit_transform(X_train)
    return pd.DataFrame(transformed, columns=column)
def power_transform(logged=None):
    """Power-transform six variables and overlay histograms of five of them.

    Parameters
    ----------
    logged : sequence, optional
        Six list-like variables, indexed 0-5 and read as price, sqft_living,
        sqft_lot, sqft_living15, sqft_lot15 and yard_size.

    Returns
    -------
    tuple or None
        ``(powered_vars, None)``, where ``powered_vars`` lists the transformed
        columns in the order price, sqft_living, sqft_living15, sqft_lot,
        sqft_lot15, yard_size and the second item is the result of
        ``plt.show()``. When ``logged`` is ``None`` an error message is
        printed and ``None`` is returned.
    """
    # Identity check: ``== None`` is element-wise for array-like arguments.
    if logged is None:
        return print(
            "Error: must have list-like logged argument that contains 6 elements."
        )

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    # One transformer object, refitted for each variable.
    power = PowerTransformer()
    (power_price, power_sqft_living, power_sqft_lot, power_sqft_living15,
     power_sqft_lot15, power_yard_size) = [
         power.fit_transform(np.array(logged[i]).reshape(-1, 1))
         for i in range(6)
     ]

    # NOTE(review): yard_size is transformed and returned but never plotted.
    for values, color in (
        (power_price, 'r'),
        (power_sqft_living, 'b'),
        (power_sqft_lot, 'g'),
        (power_sqft_living15, 'pink'),
        (power_sqft_lot15, 'y'),
    ):
        plt.hist(values, bins='auto', color=color, alpha=.7)
    plt.title('Power Transformed Variables (Centered around Zero)')

    powered_vars = [
        power_price, power_sqft_living, power_sqft_living15, power_sqft_lot,
        power_sqft_lot15, power_yard_size
    ]
    return powered_vars, plt.show()
def transform(self, X, **transform_params):
    """Impute and power-transform the continuous columns of ``X`` in place.

    Missing values in ``self._continuous`` are filled with an
    ``IterativeImputer``. The same columns are then Yeo-Johnson transformed
    without standardisation, to bring the feature distributions closer to
    Gaussian. Entry and exit are reported through ``notify``. Both estimators
    are fitted on the ``X`` passed to this call.

    Returns the same ``X`` object.
    """
    notify.entering(__class__.__name__, "transform")

    continuous = self._continuous
    X[continuous] = IterativeImputer().fit_transform(X[continuous])
    X[continuous] = PowerTransformer(
        method="yeo-johnson", standardize=False).fit_transform(X[continuous])

    notify.leaving(__class__.__name__, "transform")
    return X
# Plot every distribution before and after the power transform on a 4 x 3
# grid: each original histogram sits directly above its transformed version.
colors = ['firebrick', 'darkorange', 'goldenrod',
          'seagreen', 'royalblue', 'darkorchid']

fig, axes = plt.subplots(nrows=4, ncols=3)
axes = axes.flatten()
# (index of the "original" axis, index of the "transformed" axis) pairs.
axes_idxs = [(0, 3), (1, 4), (2, 5), (6, 9), (7, 10), (8, 11)]
axes_list = [(axes[i], axes[j]) for i, j in axes_idxs]

for distribution, color, ax_pair in zip(distributions, colors, axes_list):
    name, X = distribution
    # Rescale into [1e-10, 10]; the lower bound is kept strictly positive.
    X = minmax_scale(X, feature_range=(1e-10, 10))

    # Perform the power transform and read back the fitted lambda.
    X_trans = pt.fit_transform(X)
    lmbda = round(pt.lambdas_[0], 2)

    ax_original, ax_trans = ax_pair

    ax_original.hist(X, color=color, bins=BINS)
    ax_original.set_title(name, fontsize=FONT_SIZE)
    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

    ax_trans.hist(X_trans, color=color, bins=BINS)
    # Raw string: '\l' is not a valid escape sequence in a normal literal.
    ax_trans.set_title(r'{} after Box-Cox, $\lambda$ = {}'.format(name, lmbda),
                       fontsize=FONT_SIZE)
    ax_trans.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

plt.tight_layout()