def robust_scaler(data, center=True, reduce=True, q_range=(25.0, 75.0), return_robust_scaler=False, rebuild_df=False):
    """
    Scale features using statistics that are robust to outliers.

    This scaler removes the median and scales the data according to the
    quantile range (defaults to IQR: Interquartile Range).

    :param data: unscaled data (numpy array or dataframe)
    :param center: center unscaled data (subtract the per-feature median)
    :param reduce: reduce unscaled data (divide by the per-feature quantile range)
    :param q_range: (q_min, q_max) quantile range used for scaling, 0.0 < q_min < q_max < 100.0;
                    defaults to the IQR (25.0, 75.0)
    :param return_robust_scaler: boolean value which enable returning (or not) RobustScaler instance
    :param rebuild_df: boolean value which enable rebuilding original dataframe with scaled data
                       (requires `data` to be a dataframe)
    :return: scaled data (numpy array or dataframe), RobustScaler instance (optional)
    """
    rbt_scaler = RobustScaler(with_centering=center, with_scaling=reduce, quantile_range=q_range)
    x_scaled = rbt_scaler.fit_transform(data)
    # Rebuild a dataframe with the original columns/index only when requested;
    # otherwise keep the raw numpy array emitted by fit_transform.
    result = pd.DataFrame(x_scaled, columns=data.columns, index=data.index) if rebuild_df else x_scaled
    return (result, rbt_scaler) if return_robust_scaler else result
def min_max_scaler(data, return_min_max_scaler=False, rebuild_df=False):
    """
    Transform features by scaling each feature to a given range.

    The transformation is given by:
        X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
        X_scaled = X_std * (max - min) + min

    :param data: unscaled data (numpy array or dataframe)
    :param return_min_max_scaler: boolean value which enable returning (or not) MinMaxScaler instance
    :param rebuild_df: boolean value which enable rebuilding original dataframe with scaled data
    :return: scaled data (numpy array or dataframe), MinMaxScaler instance (optional)
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)
    # Optionally restore the dataframe shell (same columns and index) around the scaled values.
    if rebuild_df:
        scaled = pd.DataFrame(scaled, columns=data.columns, index=data.index)
    if return_min_max_scaler:
        return scaled, scaler
    return scaled
def standard_scaler(data, center=True, reduce=True, return_std_scaler=False, rebuild_df=False):
    """
    Standardize features by removing the mean and scaling to unit variance (z = (x - u) / s).

    :param data: unscaled data (numpy array or dataframe)
    :param center: center unscaled data (mean = 0)
    :param reduce: reduce unscaled data (standard deviation = 1)
    :param return_std_scaler: boolean value which enable returning (or not) StandardScaler instance
    :param rebuild_df: boolean value which enable rebuilding original dataframe with scaled data
    :return: scaled data (numpy array or dataframe), StandardScaler instance (optional)
    """
    scaler = StandardScaler(with_mean=center, with_std=reduce)
    values = scaler.fit_transform(data)
    # Wrap the scaled values back into a dataframe only when the caller asked for it.
    output = pd.DataFrame(values, columns=data.columns, index=data.index) if rebuild_df else values
    if return_std_scaler:
        return output, scaler
    return output
def get_features_importance(labels, coefs, abs_coefs=False, non_zero_coefs=False, sort=True, verbose=False):
    """
    Build a feature-importance dataframe from model coefficients.

    Adds a 'coefficient_frequency' column (each coefficient's share of the
    total coefficient sum) and its running total
    'cumulative_coefficient_frequency'.

    :param labels: feature names, aligned with `coefs`
    :param coefs: feature coefficients (e.g. from a fitted linear model)
    :param abs_coefs: if True, replace coefficients by their absolute values
    :param non_zero_coefs: if True, drop features whose coefficient is exactly 0
    :param sort: if True, sort by coefficient descending (most important features at the head)
    :param verbose: if True, print the selected-feature count and reduction ratio
    :return: dataframe with columns 'feature', 'coefficient',
             'coefficient_frequency', 'cumulative_coefficient_frequency'
    """
    # Build feature importance dataframe
    fimp_df = pd.DataFrame({'feature': labels, 'coefficient': coefs})
    # Get positive coefficients
    if abs_coefs:
        fimp_df['coefficient'] = np.abs(fimp_df['coefficient'])
    # Filter zero coefficients
    if non_zero_coefs:
        fimp_df = fimp_df[fimp_df['coefficient'] != 0]
    # Sort features (get most important features at the head)
    if sort:
        fimp_df = fimp_df.sort_values('coefficient', ascending=False).reset_index(drop=True)
    # Print selected features and reduction ratio
    if verbose:
        total_features = len(labels)
        # Filtered features count (zero coefficients removed)
        selected_features = fimp_df.shape[0]
        reduction_ratio = (1 - (selected_features / total_features)) * 100
        print('{0}/{1} features selected, reduction of {2:.1f}%'.format(selected_features, total_features, reduction_ratio))
    # Coefficient frequency: each coefficient's share of the total mass.
    # NOTE(review): frequencies are only meaningful when coefficients are
    # non-negative (use abs_coefs=True with signed coefficients).
    coefficients_sum = fimp_df['coefficient'].sum()
    fimp_df['coefficient_frequency'] = fimp_df['coefficient'] / coefficients_sum
    fimp_df['cumulative_coefficient_frequency'] = np.cumsum(fimp_df['coefficient_frequency'])
    return fimp_df
def reverse_standardization(data_scaled, scaler, rebuild_df=False):
    """
    Inverse a standardization transformation.

    :param data_scaled: scaled data (numpy array or dataframe)
    :param scaler: fitted scaler exposing `inverse_transform`
                   (e.g. StandardScaler or FunctionTransformer instance)
    :param rebuild_df: boolean value which enable rebuilding original dataframe with unscaled data
                       (requires `data_scaled` to be a dataframe)
    :return: unscaled data (numpy array or dataframe)
    """
    x_unscaled = scaler.inverse_transform(data_scaled)
    if rebuild_df:
        # Reattach the original columns/index around the unscaled values.
        return pd.DataFrame(x_unscaled, columns=data_scaled.columns, index=data_scaled.index)
    return x_unscaled