Code example #1
def normalise_data(data_list): 
    X = PowerTransformer(method='box-cox').fit_transform(data_list)
    X[:,0] = X[:,0]*2.0
    X[:,1] = X[:,1]*2.0
    X[:,2] = X[:,2]*1.6
    
    symmetry = X[:, 0]
    border = X[:, 1]
    colour = X[:, 2]
    
    list_symmetry = []
    list_border = []
    list_colour = []
    
    # Convert the numpy array columns into per-feature Python lists
    for i in range(len(X)):
        list_symmetry.append(symmetry[i])
        list_border.append(border[i])
        list_colour.append(colour[i])

    return list_symmetry, list_border, list_colour
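A minimal usage sketch (not from the original project), assuming the snippet's missing PowerTransformer import is supplied; the toy array is synthetic and strictly positive because the Box-Cox method rejects zeros and negative values.

import numpy as np
from sklearn.preprocessing import PowerTransformer  # needed by normalise_data

toy_data = np.abs(np.random.default_rng(0).normal(size=(10, 3))) + 0.1  # strictly positive
sym, border, colour = normalise_data(toy_data)
print(len(sym), len(border), len(colour))  # 10 10 10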
Code example #2
def get_preprocessed_data(pre_process_choice, data_to_change):
    if pre_process_choice == "none":
        process_data = data_to_change
    elif pre_process_choice == "normalise":
        process_data = normalize(data_to_change)
    elif pre_process_choice == "standardscaler":
        process_data = StandardScaler().fit_transform(data_to_change)
    elif pre_process_choice == "minmaxscaler":
        process_data = MinMaxScaler().fit_transform(data_to_change)
    elif pre_process_choice == "powertransformer":
        process_data = PowerTransformer(
            method='yeo-johnson').fit_transform(data_to_change)
    elif pre_process_choice == "quantiletransformer":
        process_data = QuantileTransformer(
            output_distribution='uniform').fit_transform(data_to_change)
    elif pre_process_choice == "x_pca_reduced":  #This part is only relevant to Q4 and Q5
        X = MinMaxScaler().fit_transform(
            data_to_change)  #Please see Q4 below for explanation
        process_data = PCA(n_components=121).fit_transform(X)

    return process_data
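A hedged call sketch with a synthetic array: each branch assumes the corresponding sklearn class or function (normalize, StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, PCA) is imported in the same module, and note that an unrecognized pre_process_choice would leave process_data unassigned and raise UnboundLocalError at the return.

import numpy as np
from sklearn.preprocessing import MinMaxScaler  # used by the "minmaxscaler" branch

raw = np.random.default_rng(1).random((50, 4))
scaled = get_preprocessed_data("minmaxscaler", raw)
print(scaled.min(), scaled.max())  # 0.0 and 1.0 after min-max scaling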
Code example #3
File: ts_utils.py Project: Seirdy/clogstats
def make_forecasts_ensure_positive(
    train: TimeSeries,
    n_pred: int,
    predictions_to_make: ModelsToMake,
) -> Dict[str, TimeSeries]:
    """Wrap transfomations around make_transform() to ensure positive forecasts.

    Avoids negative values in forecasts by adding 1 to all values
    (ensuring no zeros) and applying a Box-Cox transformation/inversion
    before/after forecasting.
    """
    scaler_wrapper = ScalerWrapper(  # type: ignore
        scaler=PowerTransformer(method="box-cox"), )
    # box-cox doesn't like the number "0", so add one to everything
    scaled_train = scaler_wrapper.fit_transform(train + 1)
    forecasts = make_forecasts(scaled_train, n_pred, predictions_to_make)
    # invert the transformation
    return {
        name: scaler_wrapper.inverse_transform(forecast) - 1
        for (name, forecast) in forecasts.items()
    }
Code example #4
def load_scaler(use_scaler=None):
    """a"""
    if use_scaler:
        method = use_scaler
    else:
        method = config["adu"]["feature_selection"]["scaling_method"]
    if method == "Robust":
        scaler = RobustScaler()
    elif method == "Power":
        scaler = PowerTransformer()
    elif method == "MinMax":
        scaler = MinMaxScaler()
    elif method == "Standard":
        scaler = StandardScaler()
    elif method == "QuantileUniform":
        scaler = QuantileTransformer(output_distribution="uniform")
    elif method == "QuantileGaussian":
        scaler = QuantileTransformer(output_distribution="normal")
    else:
        exit(1)
    return scaler
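A short usage sketch: `config` is a module-level settings dict in the original project, so passing use_scaler explicitly sidesteps that dependency here, and the sklearn scaler classes are assumed to be imported alongside the function.

import numpy as np

scaler = load_scaler(use_scaler="Power")  # unfitted PowerTransformer
X = np.random.default_rng(2).random((20, 3))
X_scaled = scaler.fit_transform(X)
print(X_scaled.mean(axis=0).round(3))  # roughly zero: PowerTransformer standardizes by default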
Code example #5
File: column_coder.py Project: sycomix/NeMo
 def __init__(
     self,
     col_name: str,
     code_len: int,
     start_id: int,
     fillall: bool = True,
     base: int = 100,
     hasnan: bool = True,
     transform: str = 'quantile',
 ):
     super().__init__(col_name, code_len, start_id, fillall, base, hasnan)
     if transform == 'yeo-johnson':
         self.scaler = PowerTransformer(standardize=True)
     elif transform == 'quantile':
         self.scaler = QuantileTransformer(output_distribution='uniform')
     elif transform == 'robust':
         self.scaler = RobustScaler()
     else:
         raise ValueError(
             'Supported data transformations are "yeo-johnson", "quantile", and "robust"'
         )
Code example #6
File: doPCA.py Project: m-meidani/ECE611-Replication
def doPCA(data,
          featureColumns,
          targetColumn,
          applyTransfer=False,
          outDF=False,
          analyzePCA=False,
          standardize=False):
    dataSet = DataSet()
    # Remove target column
    df = data.iloc[:, :12]
    dataSet.target = data[targetColumn].to_numpy()
    dataSet.data = df
    if applyTransfer:
        df = PowerTransformer(standardize=False).fit_transform(df)
    if standardize:
        df = StandardScaler().fit_transform(df)

    df = np.log(df + 1)
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(df)
    principalComponents = pca.transform(df)

    if analyzePCA:
        pca = PCA().fit(dataSet.data)
        print(np.cumsum(pca.explained_variance_ratio_))
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel('number of components')
        plt.ylabel('cumulative explained variance')
        plt.show()

    if outDF:
        principalDf = pd.DataFrame(data=principalComponents)
        dataSet.components = principalDf
    else:
        dataSet.components = principalComponents

    # print(dataSet.data.shape)
    # print(len(dataSet.target))

    return dataSet, pca
Code example #7
def transform_data(df, ev, tsne=False):
    '''
    Apply PowerTransformer(), PCA(), and optionally TSNE() sequentially on dataframe

    Args:
    df (pd.DataFrame): subject dataframe
    ev (int, float, None, str): explained variance; corresponds to the `n_components`
        parameter of the PCA() class and hence accepts the same values
    tsne (bool) [default=False]: When True, apply TSNE() on dataframe

    Return:
    X (array): transformed dataframe
    '''

    X = PCA(ev, random_state=42).fit_transform(
        PowerTransformer().fit_transform(df))

    if tsne == True:
        perplexity = int(X.shape[0]**0.5)
        X = TSNE(perplexity=perplexity, random_state=42).fit_transform(X)

    return X
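An illustrative call with assumed parameter values (not from the source); PCA, PowerTransformer, and TSNE are expected to be imported where transform_data is defined. Passing ev=0.9 keeps enough components to explain about 90% of the variance.

import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.default_rng(3).normal(size=(100, 8)))
X_pca = transform_data(df_demo, ev=0.9)
print(X_pca.shape)  # (100, k) with k <= 8 components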
Code example #8
File: dataset.py Project: geraldwal/class_notebooks
 def ensure_normality(self,
                      features_of_type='numerical',
                      return_series=False):
     """
     Ensures that the numerical features in the dataset, unless the
     parameter 'features_of_type' selects another subset, fit a normal
     distribution by applying the Yeo-Johnson transform.
     :param features_of_type: Subset selection primitive
     :param return_series: Return the normalized series
     :return: the subset fitted to normal distribution.
     """
     assert features_of_type in self.meta_tags
     subset = self.select(features_of_type)
     mapper = DataFrameMapper([(subset.columns,
                                PowerTransformer(method='yeo-johnson',
                                                 standardize=False))])
     normed_features = mapper.fit_transform(subset.copy())
     self.features[self.names(features_of_type)] = pd.DataFrame(
         normed_features, index=subset.index, columns=subset.columns)
     self.metainfo()
     if return_series is True:
         return self.features[self.names(features_of_type)]
Code example #9
    def PreProcessing(self, scale_cols, one_hot_cols):  #, ordinal_cols):
        """Establishes preprocessing and a pipeline for modeling

        Parameters
        ----------
        scale_cols : list
            A list of numerical columns to be scaled
        one_hot_cols : list
            A list of categorical columns that will be one hot encoded        

        """

        # set transformer methods
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)
        label_encode = LabelEncoder()
        imputer = SimpleImputer(add_indicator=True, verbose=1)
        scaler = PowerTransformer()

        # Make Transformer
        self.preprocessing = make_column_transformer(
            (ohe, one_hot_cols), (make_pipeline(imputer, scaler), scale_cols),
            remainder='drop')
Code example #10
def normalize_data(data, scaler):
    """
    Data normalization by using a chosen scaler
    :param data: input data frame
    :param scaler: string for chosen scaler
    :return: scaled data frame
    """

    # Switch between different scalers
    # Helpful source: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
    switcher = {
        "max_abs": MaxAbsScaler(),
        "standard": StandardScaler(),
        "min_max": MinMaxScaler(),
        "robust_scaler": RobustScaler(),
        "quantile_transformer": QuantileTransformer(output_distribution='normal'),
        "power_transform": PowerTransformer(method='yeo-johnson')
    }

    scaler = switcher.get(scaler, StandardScaler())

    return scaler.fit_transform(data), scaler
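A minimal sketch with a synthetic frame (the column name is illustrative); the scaler classes are assumed to be imported, and an unknown key falls back to StandardScaler via switcher.get().

import numpy as np
import pandas as pd

frame = pd.DataFrame({"a": np.random.default_rng(4).exponential(size=100)})
scaled, fitted_scaler = normalize_data(frame, "power_transform")
print(type(fitted_scaler).__name__)  # PowerTransformer
print(scaled.mean().round(3))        # near 0 after Yeo-Johnson with standardization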
Code example #11
def normalize_df(df, numeric_data):
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
    norm_method = st.sidebar.selectbox('Choose normalization method',
                                       ('None', 'StandardScaler', 'MinMaxScaler', 'RobustScaler',
                                        'QuantileTransformer', 'PowerTransformer'))
    if norm_method == 'None':
        return df[numeric_data]
    else:
        if norm_method == 'StandardScaler':
            scaler = StandardScaler()
        elif norm_method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        elif norm_method == 'RobustScaler':
            scaler = RobustScaler()
        elif norm_method == 'QuantileTransformer':
            scaler = QuantileTransformer()
        elif norm_method == 'PowerTransformer':
            scaler = PowerTransformer()

        df_scaled = scaler.fit_transform(df[numeric_data])
        st.success('Done!')
        return df_scaled
Code example #12
File: transform.py Project: wuzunzun/XenonPy
    def __init__(self,
                 *,
                 method='yeo-johnson',
                 standardize=False,
                 lmd=None,
                 tolerance=(-np.inf, np.inf),
                 on_err=None):
        """

        Parameters
        ----------
        method: 'yeo-johnson' or 'box-cox'
            'yeo-johnson' works with positive and negative values;
            'box-cox' only works with strictly positive values.
        standardize: boolean
            Whether to normalize to a standard normal distribution.
            Using a separate `standard` function instead of this option is recommended.
        lmd: list or 1-dim ndarray
            You may assign a specific lmd to each input xs yourself.
            Leave None (default) to use an inferred value.
            See `PowerTransformer`_ for details.
        tolerance: tuple
            Tolerance of lmd. Set None to accept any value.
            Default is **(-np.inf, np.inf)**, but **(-2, 2)** is recommended for the Box-Cox transform.
        on_err: None or str
            Error handling when inferring lambda fails. Can be None or one of the
            strings **log**, **nan**, or **raise**.
            **log** returns the logarithmic transform of xs, min-shifted to 1.
            **nan** returns an ``ndarray`` of shape xs.shape filled with ``np.nan``.
            **raise** raises a FloatingPointError, which you can catch yourself.
            The default (None) returns the input series without any scale transform.

        .. _PowerTransformer:
            https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
        """
        self._tolerance = tolerance
        self._pt = PT(method=method, standardize=standardize)
        self._lmd = lmd
        self._shape = None
        self._on_err = on_err
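A hedged illustration of the lambda/tolerance idea described in the docstring, using plain scikit-learn rather than the XenonPy wrapper itself: the fitted lambdas_ can be checked against a tolerance window such as (-2, 2).

import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(5)
xs = rng.lognormal(size=(200, 3))  # strictly positive, suitable for box-cox
pt = PowerTransformer(method='box-cox', standardize=False).fit(xs)
print(pt.lambdas_)  # one fitted lambda per column
tolerance = (-2, 2)
within = (pt.lambdas_ >= tolerance[0]) & (pt.lambdas_ <= tolerance[1])
print(within)  # columns whose inferred lambda falls inside the tolerance window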
Code example #13
    def infer(self):

        train_pred = self.model.predict(self.X_train)
        val_pred = self.model.predict(self.X_val)
        test_pred = self.model.predict(self.X_test)
        print(
            "-----------------------------------------------------------------"
        )
        print("Training results", "\n")
        if self.transform is not None:
            scaler = PowerTransformer(method="box-cox")
            scaler.fit(np.array(self.train.actual_load).reshape(-1, 1))
            inv_train_pred = scaler.inverse_transform(
                np.array(train_pred).reshape(-1, 1))
            inv_val_pred = scaler.inverse_transform(
                np.array(val_pred).reshape(-1, 1))
            inv_test_pred = scaler.inverse_transform(
                np.array(test_pred).reshape(-1, 1))
            print(
                "Training error: ",
                mse(self.train.actual_load, inv_train_pred, squared=False),
            )
            print(
                "Validation error: ",
                mse(self.val.actual_load, inv_val_pred, squared=False),
            )
            print("Test error: ", mse(self.y_test,
                                      inv_test_pred,
                                      squared=False))
            print(
                "Note : The error printed above is calculated after the inverse transform of box-cox"
            )

        else:
            print("Training error: ",
                  mse(self.y_train, train_pred, squared=False))
            print("Validation error: ", mse(self.y_val,
                                            val_pred,
                                            squared=False))
            print("Test error: ", mse(self.y_test, test_pred, squared=False))
Code example #14
File: logger.py Project: LarsHanegraaf/fclearn
    def update(self, event, instance):
        """Observer method that adds a row for every iteration to the DF."""
        if event == Events.OPTIMIZATION_STEP:
            result = instance.res[-1]
            row = {
                "mean_test_rmse":
                result["target"],
                "param_regression__regressor__max_depth":
                int(result["params"]["max_depth"]),
                "param_regression__regressor__n_estimators":
                int(result["params"]["n_estimators"]),
                "param_regression__transformer":
                "None" if result["params"]["transformer"] < 0.5 else
                PowerTransformer(),
            }
            self.data = self.data.append(row, ignore_index=True)
        if event == Events.OPTIMIZATION_END:
            self.data["rank_test_rmse"] = self.data.sort_values(
                "mean_test_rmse", ascending=False)[[
                    "mean_test_rmse"
                ]].apply(lambda x: pd.Series(np.arange(len(x)) + 1, x.index))

        self._update_tracker(event, instance)
Code example #15
def power_normalization(
    data: np.ndarray,
    return_scaler: bool = False,
    scaler: Optional[object] = None
) -> Union[np.ndarray, Tuple[np.ndarray, object]]:
    """Normalizes provided data via power normalization.
    More: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
    Uses sklearn.preprocessing.PowerTransformer() object.
    :param data: np.ndarray
                data in 2d-format (n_samples, n_features) or 1d-format (n_samples,)
    :param return_scaler: bool
                Whether the function should also return the fitted scaler.
    :param scaler: object (e.g. a fitted sklearn.preprocessing.PowerTransformer)
                If not None, the provided scaler will be used as the normalizer.
    :return: np.ndarray or (np.ndarray, scaler)
                If return_scaler==False, returns normalized data
                else returns normalized data and fitted scaler
    """
    # check if data is in appropriate format
    if len(data.shape) > 2:
        raise AttributeError(
            'The supplied data should be 1- or 2-dimensional. Got %i.' %
            (len(data.shape)))
    # if data is 1-dimensional, it should be converted into 2-dimensional by adding additional dimension
    if len(data.shape) == 1:
        data = data[..., np.newaxis]

    # if no scaler supplied, create scaler and fit it
    if scaler is None:
        scaler = PowerTransformer()
        scaler.fit(data)
    # transform data
    data = scaler.transform(data)
    # return the scaler if needed
    if return_scaler:
        return data, scaler
    return data
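A usage sketch with synthetic arrays: fit on the training split once, then pass the fitted scaler back in so the test split is transformed with the same parameters (PowerTransformer and the typing names are assumed to be imported where the function lives).

import numpy as np

rng = np.random.default_rng(6)
train = rng.exponential(size=(80, 2))
test = rng.exponential(size=(20, 2))
train_norm, fitted = power_normalization(train, return_scaler=True)
test_norm = power_normalization(test, scaler=fitted)
print(train_norm.shape, test_norm.shape)  # (80, 2) (20, 2)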
Code example #16
File: MLPreprocessing.py Project: WellJoea/MLkit
    def Standard_(self, dfa, scale = 'S'):
        Scalers = {
            'S' : StandardScaler(),
            'R' : RobustScaler(quantile_range=tuple(self.arg.QuantileRange)),
            'M' : MinMaxScaler(),
            'MA': MaxAbsScaler(),
            'OE': OrdinalEncoder(),
            'OH': OneHotEncoder(),
            'NL' : Normalizer(),
            'QT': QuantileTransformer(),
            'PT': PowerTransformer(),
            'N' : FunctionTransformer( validate=False ),
        }
        Sca_map = [Scalers[i] for i in scale]
        Xa = list( dfa.columns )

        mapper = DataFrameMapper([ ( Xa, Sca_map ) ])
        clfit = mapper.fit( dfa )

        self.log.CIF('Standardization Processing'.center(45, '-'))
        self.log.NIF('Scale parameters:\n%s' %clfit)
        self.log.CIF(45 * '-')

        return clfit
Code example #17
def transform_data(X, do_diff=True, do_power_transform=True):
    def diff(X, y=None):
        return np.diff(X, axis=0)

    def diff2(X, y=None):
        return diff(diff(X))

    first_diff = ('first_difference',
                  FunctionTransformer(func=diff, validate=True))
    power_transform = ('power_transform',
                       PowerTransformer(method='yeo-johnson',
                                        standardize=True))
    pipeline = []
    if do_diff:
        pipeline.append(first_diff)
    if do_power_transform:
        pipeline.append(power_transform)

    if len(pipeline) == 0:
        Z = X
    else:
        pipeline = Pipeline(pipeline)
        Z = pipeline.fit_transform(X)
    return Z
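A small sketch on a synthetic random walk (FunctionTransformer, PowerTransformer, and Pipeline are assumed to be imported): first differencing removes the trend before the Yeo-Johnson transform, so the output has one row fewer than the input.

import numpy as np

walk = np.cumsum(np.random.default_rng(7).normal(size=(200, 1)), axis=0)
Z = transform_data(walk, do_diff=True, do_power_transform=True)
print(walk.shape, Z.shape)  # (200, 1) (199, 1)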
Code example #18
def power_transform_feature(data, col):
    df = data.orderBy("Date").toPandas()
    df['pwr_tf'] = df[col]

    sc = MinMaxScaler(feature_range=(1, 2))
    pt = PowerTransformer(method='box-cox')
    pipeline = Pipeline(steps=[('s', sc), ('p', pt)])
    df[['pwr_tf']] = pipeline.fit_transform(df[['pwr_tf']])

    fig, axs = plt.subplots(2, 2, figsize=(10, 7))
    fig.suptitle(col)

    axs[0, 0].plot(df["Date"], df[col])
    axs[0, 0].set(xlabel='Date', ylabel='Num Cases')
    axs[0, 0].set_title('Daily Cases')

    axs[0, 1].hist(df[col])
    axs[0, 1].set(xlabel='Num Cases', ylabel='Freq')
    axs[0, 1].set_title('Histogram Daily Cases')

    axs[1, 0].plot(df["Date"], df['pwr_tf'])
    axs[1, 0].set(xlabel='Date', ylabel='Num Cases')
    axs[1, 0].set_title('After Power Transform')

    axs[1, 1].hist(df['pwr_tf'])
    axs[1, 1].set(xlabel='Num Cases', ylabel='Freq')
    axs[1, 1].set_title('Histogram After PT')

    for ax in axs.flat:
        ax.set(xlabel='Date', ylabel='Num Cases')

    # Hide x labels and tick labels for top plots and y ticks for right plots.
    for ax in axs.flat:
        ax.label_outer()

    plt.tight_layout()
Code example #19
def transform_amplitude(inputfile, scale=True):
    amplitudes = np.fromfile(inputfile, dtype=np.float64)  # np.float alias was removed in NumPy >= 1.24
    n_samples = amplitudes.shape[0]
    amplitudes = amplitudes.reshape((n_samples, -1))

    bc = PowerTransformer(method='box-cox')
    yj = PowerTransformer(method='yeo-johnson')
    qt = QuantileTransformer(n_quantiles=n_samples,
                             output_distribution='normal')
    min_max_scaler = MinMaxScaler()

    bc_amplitudes = bc.fit_transform(amplitudes)
    yj_amplitudes = yj.fit_transform(amplitudes)
    qt_amplitudes = qt.fit_transform(amplitudes)

    if scale:
        bc_amplitudes = min_max_scaler.fit_transform(bc_amplitudes)
        yj_amplitudes = min_max_scaler.fit_transform(yj_amplitudes)
        qt_amplitudes = min_max_scaler.fit_transform(qt_amplitudes)

    return amplitudes, bc_amplitudes, yj_amplitudes, qt_amplitudes
Code example #20
def scaling_data(df, sismic_data_labeled, geo_data_labeled):
    # scaler = StandardScaler()
    scaler = PowerTransformer()
    sismic_data_scaled = scaler.fit_transform(sismic_data_labeled)
    scaler = PowerTransformer()
    # scaler = StandardScaler()
    geo_data_scaled = scaler.fit_transform(geo_data_labeled)
    data_scaled = np.concatenate((sismic_data_scaled, geo_data_scaled), axis=-1)

    # Determine indices for Arenito samples and assign weights
    weight = np.mean(data_scaled.max(axis=0) - data_scaled.min(axis=0)) / 2
    isarenito = []
    for i in range(len(df['Geology'])):
        if 'A' in str(df['Geology'].iloc[i]):
            isarenito.append(weight)
        else:
            isarenito.append(-weight)
    isarenito = np.array(isarenito)
    isarenito = np.expand_dims(isarenito, axis=-1)
    data2evaluate = np.concatenate((data_scaled, isarenito), axis=-1)
    df_scaled = pd.DataFrame(data2evaluate, columns=['DT', 'GR', 'NPHI', 'RHOB', 'Permeabilidade', 'Porosidade', 'RQI', 'FZI', 'isArenito'])
    return df_scaled
Code example #21
from sklearn.preprocessing import StandardScaler, Normalizer, QuantileTransformer, PowerTransformer, OneHotEncoder, FunctionTransformer
from collections import Counter
import numpy as np
import streamlit as st 

transformer = dict({'Standard Scaler': StandardScaler(), 
'Normalizer (Unit Norm)': Normalizer(),
'Quantile-Transformer':QuantileTransformer(output_distribution='normal'),
'Box-Cox':PowerTransformer(method='box-cox'), 
'Yeo-Johnson':PowerTransformer(),
'Custom': 'PlaceHolder'}) # add a new transformer of choice

def tform(df, key, func):
    if func:
        return FunctionTransformer(func).fit_transform(df)
    else:
        return transformer[key].fit_transform(df)

def topk_encoder(X, k=10):
    counts = Counter(X)
    most_common = dict(counts.most_common(k))
    topk = list(most_common.keys())
    return OneHotEncoder(categories=[topk], handle_unknown='ignore')
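A hedged example of the two code paths in tform with a synthetic frame: passing func routes the data through FunctionTransformer, which is what the 'Custom' placeholder is meant for, while func=None uses the named transformer from the dict.

import numpy as np
import pandas as pd

demo = pd.DataFrame({"x": np.random.default_rng(8).exponential(size=100)})
log_scaled = tform(demo, key=None, func=np.log1p)      # custom-function path
yj_scaled = tform(demo, key='Yeo-Johnson', func=None)  # named-transformer path
print(np.asarray(log_scaled).shape, yj_scaled.shape)   # (100, 1) (100, 1)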

Code example #22
plt.ylim([0, 0.05])
plt.xlabel(r'Original NO$_2$ data ($\mu g/m^3$)')
plt.ylabel(r'Normalized frequency')
plt.tight_layout()
plt.savefig("data_no2_hist.pdf", format='pdf')

plt.figure(figsize=(5, 4))
plt.hist(pm10[0], bins=50, density=True, color='gray')
plt.xlim([0, 300])
plt.ylim([0, 0.04])
plt.xlabel(r'Original PM$_{10}$ data ($\mu g/m^3$)')
plt.ylabel(r'Normalized frequency')
plt.tight_layout()
plt.savefig("data_pm10_hist.pdf", format='pdf')

pt_no2 = PowerTransformer()
pt_no2.fit(no2[0])

plt.figure(figsize=(5, 4))
plt.hist(pt_no2.transform(no2[0][no2[0] != 0].reshape(-1, 1)),
         bins=50,
         density=True,
         color='gray')
plt.xlim([-5, 5])
plt.ylim([0, 0.5])
plt.xlabel(r'Power transformed NO$_2$ data')
plt.ylabel(r'Normalized frequency')
plt.tight_layout()
plt.savefig("data_no2_hist_trans.pdf", format='pdf')

pt_pm10 = PowerTransformer()
Code example #23
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer, minmax_scale

print(__doc__)


N_SAMPLES = 3000
FONT_SIZE = 6
BINS = 100


pt = PowerTransformer(method='box-cox', standardize=False)
rng = np.random.RandomState(304)
size = (N_SAMPLES, 1)


# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)
Code example #24
# Take only 2 features to make visualization easier
# Feature 0 has a long-tailed distribution.
# Feature 5 has a few but very large outliers.

X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling', StandardScaler().fit_transform(X)),
    ('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
     RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
     QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)

# plasma does not exist in matplotlib < 1.5
cmap = getattr(cm, 'plasma_r', cm.hot_r)
Code example #25
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split

print(__doc__)

N_SAMPLES = 1000
FONT_SIZE = 6
BINS = 30

rng = np.random.RandomState(304)
bc = PowerTransformer(method='box-cox')
yj = PowerTransformer(method='yeo-johnson')
qt = QuantileTransformer(output_distribution='normal', random_state=rng)
size = (N_SAMPLES, 1)

# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)
Code example #26
maxabs_fig, ax = plt.subplots()
sn.distplot(MaxAbsScaler().fit_transform(X)).set_title('MaxAbsScaler')
maxabs_fig.savefig('Transformation-MaxAbsScaler' + '.pdf',
                   bbox_inches='tight',
                   dpi=None,
                   facecolor='w',
                   edgecolor='b',
                   orientation='portrait',
                   papertype=None,
                   format=None,
                   transparent=True,
                   pad_inches=0.25,
                   frameon=None)

PowerTrans_fig, ax = plt.subplots()
sn.distplot(PowerTransformer(
    method='yeo-johnson').fit_transform(X)).set_title('PowerTransformer')
PowerTrans_fig.savefig('Transformation-PowerTransformer' + '.pdf',
                       bbox_inches='tight',
                       dpi=None,
                       facecolor='w',
                       edgecolor='b',
                       orientation='portrait',
                       papertype=None,
                       format=None,
                       transparent=True,
                       pad_inches=0.25,
                       frameon=None)

Robust_fig, ax = plt.subplots()
sn.distplot(RobustScaler().fit_transform(X)).set_title('RobustScaler')
Robust_fig.savefig('Transformation-RobustScaler' + '.pdf',
Code example #27
# Seems we want to strip 4 first columns of MFCC and 24 last of chroma
cleaned_x = pd.concat([rhythm, chroma_cleaned, mfcc_cleaned],
                      axis='columns',
                      ignore_index=True)

# Outlier detection
threshold = 3
for col in range(cleaned_x.shape[1]):
    mean = np.mean(cleaned_x.iloc[:, col])
    z = np.abs(stats.zscore(cleaned_x.iloc[:, col]))
    rows = np.where(z > threshold)[0]  # row indices whose z-score exceeds the threshold
    for row in rows:
        cleaned_x.at[row, col] = mean

# Scaling
scaler = PowerTransformer()
scaled_data = scaler.fit_transform(cleaned_x)
scaled_df = pd.DataFrame(scaled_data)

cleaned_data = pd.concat([df_labels, scaled_df], axis='columns', ignore_index=True)

# Split dataset into train and test
train, test = train_test_split(cleaned_data, test_size=0.3, random_state=0)

x_train = train.drop(labels=0, axis='columns')
y_train = np.ravel(train[[0]])

x_test = test.drop(labels=0, axis='columns')
y_test = np.ravel(test[[0]])

# Build a bagging meta-estimator: best result was ~57 % with 1000 estimators and 10 max features
Code example #28
        scaled[:, col] = [(x - min_) / (max_ - min_) * rng + min_val for x in arr[:, col]]

    return scaled


data_scaled_5_10 = fit_range(data)
plot_hist(data_scaled_5_10)


# NON-LINEAR TRANSFORMATION
data_scaled_quantile = QuantileTransformer(n_quantiles=100,
                                           random_state=0) \
    .fit_transform(data)
plot_hist(data_scaled_quantile)

data_scaled_quantile_normal = QuantileTransformer(n_quantiles=100,
                                                  random_state=0,
                                                  output_distribution='normal') \
    .fit_transform(data)
plot_hist(data_scaled_quantile_normal)

data_scaled_power = PowerTransformer().fit_transform(data)
plot_hist(data_scaled_power)


# FEATURE SAMPLING
sampler = KBinsDiscretizer(n_bins=[3, 4, 3, 10, 2, 4], encode='ordinal')
data_discrete = sampler.fit_transform(data)
print(sampler.bin_edges_)
plot_hist(data_discrete)
Code example #29
File: mymodel.py Project: msknapp/machine-learning
    def __init__(self,
                 getter: FeatureGetter = None,
                 core_type: str = 'linear-regression',
                 output_transform: str = 'log',
                 imputation_method: str = "basic",
                 scaling=None,
                 age_half_life: float = 65.0):
        if getter is None:
            getter = FeatureGetter()
        self.core_type = core_type
        self.output_transform = output_transform
        self.age_half_life = age_half_life  # use the constructor argument rather than a hard-coded value

        # these things need to be fit.
        self.feature_mergers = [
            # FeatureMerger(produces='area', source_columns=['TotalBsmtSF', 'GrLivArea'],
            #               getter=getter, operation='sum'),
            FeatureMerger(produces='baths',
                          source_columns=['BsmtFullBath', 'FullBath'],
                          getter=getter,
                          operation='sum'),
            FeatureMerger(produces='half_baths',
                          source_columns=['BsmtHalfBath', 'HalfBath'],
                          getter=getter,
                          operation='sum'),
            FeatureMerger(produces='access',
                          source_columns=['LotFrontage', 'PavedDrive'],
                          getter=getter),
            # FeatureMerger(produces='shape', source_columns=['LotShape', 'LandContour', 'LandSlope'],
            #               feature_names=self.feature_names),
            FeatureMerger(produces='utilities',
                          source_columns=['Heating', 'HeatingQC'],
                          getter=getter),
            FeatureMerger(
                produces='quality',
                source_columns=['OverallQual', 'OverallCond', 'KitchenQual'],
                getter=getter,
                morph="exp",
                morph_slope=0.1),
            FeatureMerger(produces='exterior',
                          source_columns=[
                              'LotArea', 'RoofStyle', 'RoofMatl',
                              'Exterior1st', 'Exterior2nd', 'MasVnrType',
                              'MasVnrArea', 'ExterCond'
                          ],
                          getter=getter),
            FeatureMerger(
                produces='porch',
                source_columns=['OpenPorchSF', 'EnclosedPorch', 'ScreenPorch'],
                getter=getter,
                operation='sum'),
            FeatureMerger(produces='garage',
                          source_columns=[
                              'GarageType', 'GarageYrBlt', 'GarageFinish',
                              'GarageCars', 'GarageArea', 'GarageQual',
                              'GarageCond'
                          ],
                          getter=getter),
            FeatureMerger(produces='fireplaces',
                          source_columns=['Fireplaces', 'FireplaceQu'],
                          getter=getter),
            FeatureMerger(produces='basement',
                          source_columns=[
                              'BsmtQual', 'BsmtCond', 'BsmtExposure',
                              'BsmtFinType1', 'BsmtFinType2'
                          ],
                          getter=getter),
            # FeatureMerger(produces='conditions', source_columns=['Condition1', 'Condition2'],
            #               feature_names=self.feature_names, operation='sum')
        ]
        self.imputation_method = imputation_method
        if self.imputation_method == 'neighbors':
            df = DistanceFunc(getter=getter,
                              feature_weights={
                                  "Neighborhood": 100.0,
                                  "GrLivArea": 0.01,
                                  "FullBath": 5,
                                  "BedroomAbvGr": 7,
                                  "YrSold": 10,
                              })
            self.imputer = KNNImputer(metric=df.get_distance)
        else:
            self.imputer = SimpleImputer(strategy='most_frequent')
        self.categorical_mapping = load_categorical_mapping()
        self.categorical_transform = CategoricalTransformer(
            feature_names=getter.feature_names,
            mapping=self.categorical_mapping)
        self.model = None
        self.is_fit = False
        self.getter = getter
        self.scaling = scaling
        self.scaler = None
        if self.scaling == 'robust':
            self.scaler = RobustScaler()
        elif self.scaling == 'power':
            self.scaler = PowerTransformer()
Code example #30
File: mymodel.py Project: msknapp/machine-learning
class MyModel(BaseEstimator, RegressorMixin):
    def __init__(self,
                 getter: FeatureGetter = None,
                 core_type: str = 'linear-regression',
                 output_transform: str = 'log',
                 imputation_method: str = "basic",
                 scaling=None,
                 age_half_life: float = 65.0):
        if getter is None:
            getter = FeatureGetter()
        self.core_type = core_type
        self.output_transform = output_transform
        self.age_half_life = age_half_life  # use the constructor argument rather than a hard-coded value

        # these things need to be fit.
        self.feature_mergers = [
            # FeatureMerger(produces='area', source_columns=['TotalBsmtSF', 'GrLivArea'],
            #               getter=getter, operation='sum'),
            FeatureMerger(produces='baths',
                          source_columns=['BsmtFullBath', 'FullBath'],
                          getter=getter,
                          operation='sum'),
            FeatureMerger(produces='half_baths',
                          source_columns=['BsmtHalfBath', 'HalfBath'],
                          getter=getter,
                          operation='sum'),
            FeatureMerger(produces='access',
                          source_columns=['LotFrontage', 'PavedDrive'],
                          getter=getter),
            # FeatureMerger(produces='shape', source_columns=['LotShape', 'LandContour', 'LandSlope'],
            #               feature_names=self.feature_names),
            FeatureMerger(produces='utilities',
                          source_columns=['Heating', 'HeatingQC'],
                          getter=getter),
            FeatureMerger(
                produces='quality',
                source_columns=['OverallQual', 'OverallCond', 'KitchenQual'],
                getter=getter,
                morph="exp",
                morph_slope=0.1),
            FeatureMerger(produces='exterior',
                          source_columns=[
                              'LotArea', 'RoofStyle', 'RoofMatl',
                              'Exterior1st', 'Exterior2nd', 'MasVnrType',
                              'MasVnrArea', 'ExterCond'
                          ],
                          getter=getter),
            FeatureMerger(
                produces='porch',
                source_columns=['OpenPorchSF', 'EnclosedPorch', 'ScreenPorch'],
                getter=getter,
                operation='sum'),
            FeatureMerger(produces='garage',
                          source_columns=[
                              'GarageType', 'GarageYrBlt', 'GarageFinish',
                              'GarageCars', 'GarageArea', 'GarageQual',
                              'GarageCond'
                          ],
                          getter=getter),
            FeatureMerger(produces='fireplaces',
                          source_columns=['Fireplaces', 'FireplaceQu'],
                          getter=getter),
            FeatureMerger(produces='basement',
                          source_columns=[
                              'BsmtQual', 'BsmtCond', 'BsmtExposure',
                              'BsmtFinType1', 'BsmtFinType2'
                          ],
                          getter=getter),
            # FeatureMerger(produces='conditions', source_columns=['Condition1', 'Condition2'],
            #               feature_names=self.feature_names, operation='sum')
        ]
        self.imputation_method = imputation_method
        if self.imputation_method == 'neighbors':
            df = DistanceFunc(getter=getter,
                              feature_weights={
                                  "Neighborhood": 100.0,
                                  "GrLivArea": 0.01,
                                  "FullBath": 5,
                                  "BedroomAbvGr": 7,
                                  "YrSold": 10,
                              })
            self.imputer = KNNImputer(metric=df.get_distance)
        else:
            self.imputer = SimpleImputer(strategy='most_frequent')
        self.categorical_mapping = load_categorical_mapping()
        self.categorical_transform = CategoricalTransformer(
            feature_names=getter.feature_names,
            mapping=self.categorical_mapping)
        self.model = None
        self.is_fit = False
        self.getter = getter
        self.scaling = scaling
        self.scaler = None
        if self.scaling == 'robust':
            self.scaler = RobustScaler()
        elif self.scaling == 'power':
            self.scaler = PowerTransformer()

    def get_params(self, deep=True):
        return {
            "core_type": self.core_type,
            "output_transform": self.output_transform,
            "age_half_life": self.age_half_life,
        }

    def predict(self, x):
        x2 = self.transform(x)
        y2 = self.model.predict(x2)
        y = y2
        if self.output_transform == 'log':
            y = np.exp(y2)
        return y

    def _setup(self, x):
        x0 = x.copy() if not isinstance(x, pd.DataFrame) else x.to_numpy()
        x1 = self.categorical_transform.transform(
            x0, exclude_columns=['YrSold', 'MoSold', 'YearBuilt'])
        return x1

    def _preprocess(self,
                    x: np.ndarray,
                    y: np.ndarray = None,
                    operation: str = 'fit'):
        overall_quality = self.getter.get_column('OverallQual', x)
        overall_quality = overall_quality.astype(float)
        x = self._setup(x)
        if operation == 'fit':
            x = self.imputer.fit_transform(x)
        else:
            x = self.imputer.transform(x)
        year_sold = self.getter.get_column('YrSold', x)
        month_sold = self.getter.get_column('MoSold', x)
        year_built = self.getter.get_column('YearBuilt', x)
        age = year_sold - year_built

        # TODO a column for the seasonal effect of things
        merged_columns = []
        for merger in self.feature_mergers:
            op = 'fit_transform' if 'fit' in operation else 'transform'
            merged_column = merger._process(x, operation=op)
            merged_columns.append(merged_column)
        merged_columns = np.array(merged_columns).T
        bedrooms = self.getter.get_column("BedroomAbvGr", x)

        year_built = normalize(year_built)
        age = normalize(age)
        bedrooms = normalize(bedrooms)

        # foundation seems to make it worse.
        misc = self.getter.get_column('MiscVal', x)
        misc = normalize(misc)
        neighborhood = self.getter.get_column('Neighborhood', x)
        # zoning seems to help the prediction
        zoning = self.getter.get_column('MSZoning', x)
        pool = self.getter.get_column('PoolArea', x)
        has_pool = np.array(list(map(lambda t: 1.0 if t > 0.0 else 0.0, pool)))
        building_type = self.getter.get_column('BldgType', x)
        house_style = self.getter.get_column('HouseStyle', x)
        functional = self.getter.get_column('Functional', x)
        central_air = self.getter.get_column("CentralAir", x)
        electrical = self.getter.get_column("Electrical", x)
        twoflr = self.getter.get_column("2ndFlrSF", x)
        has_two_floors = np.array(
            list(map(lambda t: 1.0 if t > 0.0 else 0.0, twoflr)))

        grla = self.getter.get_column('GrLivArea', x)
        bsma = self.getter.get_column('TotalBsmtSF', x)
        total_area = (grla + bsma)
        area_feature = total_area / 6000.0
        tmp = overall_quality / 10.0
        quality_feature = np.exp(tmp) / math.e

        yr_sold = self.getter.get_column('YrSold', x)
        yr_sold = yr_sold.astype(float)
        yr_built = self.getter.get_column('YearBuilt', x)
        yr_built = yr_built.astype(float)
        age = yr_sold - yr_built
        age_feature = np.exp(-age / self.age_half_life)
        combined = age_feature * quality_feature * area_feature
        combined2 = quality_feature * area_feature

        input_data = [
            combined, combined2, age_feature, quality_feature, area_feature,
            neighborhood, bedrooms, merged_columns, misc, building_type,
            house_style
        ]  #, functional, central_air, electrical, month_sold,
        # has_pool, zoning, has_two_floors]
        x3 = np.column_stack(input_data)

        # The robust scaler seems to be helping the accuracy.
        if self.scaler is not None:
            if 'fit' in operation:
                self.scaler.fit(x3)
            if 'transform' in operation:
                x3 = self.scaler.transform(x3)

        if operation == 'transform':
            return x3
        # if self.core_type == 'linear-regression':
        #     core = LinearRegression()
        # elif self.core_type == 'elastic-net':
        #     core = ElasticNet()
        # elif self.core_type == 'perceptron':
        #     core = MLPRegressor(max_iter=1500)

        core1 = ElasticNet()

        self.model = GradientBoostingRegressor(init=core1,
                                               n_estimators=100,
                                               loss='huber')
        y2 = y.copy()
        if self.output_transform == 'log':
            y2 = np.log(y2)
        self.model.fit(x3, y2)
        self.is_fit = True
        return self

    def fit(self, x, y):
        return self._preprocess(x, y, operation='fit')

    def transform(self, x):
        return self._preprocess(x, operation='transform')
Code example #31
File: plotting.py Project: isaacovercast/PTA
def plot_simulations_pca(sims, ax='',\
                            figsize=(8, 8),\
                            target='',\
                            feature_set='',\
                            loadings=False,\
                            nsims=1000,\
                            select='',\
                            tol='',\
                            title='',\
                            outfile='',\
                            colorbar=True,\
                            verbose=False):
    """
    Plot summary statistics for simulations projected into PC space.

    :param str sims: 
    :param matplotlib.pyplot.axis ax:
    :param tuple figsize:
    :param str target:
    :param list feature_set:
    :param bool loadings: BROKEN! Whether to plot the loadings in the figure.
    :param int nsims:
    :param int/float select: 
    :param int/float tol:
    :param str title:
    :param str outfile:
    :param bool verbose:

    :return: Return the `matplotlib.pyplot.axis` on which the simulations are
        plotted.
    """
    if not ax:
        fig, ax = plt.subplots(figsize=figsize)

    ## Filter and downsample the simulations
    sim_df = _filter_sims(sims,\
                            feature_set=feature_set,\
                            nsims=nsims,\
                            select=select,\
                            tol=tol,\
                            verbose=verbose)

    ## Have to retain the targets because we drop them prior to PCA
    target_df = sim_df[default_targets]
    sim_df = sim_df.drop(default_targets, axis=1)

    ## These are also left over from mess and not sure they are needed.
    # sim_df = StandardScaler().fit_transform(sim_df)
    sim_df = PowerTransformer(method='yeo-johnson').fit_transform(sim_df)

    pca = PCA(n_components=2)
    dat = pca.fit_transform(sim_df)

    if not target:
        target = "zeta"
    target_values = target_df[target].values
    sc = ax.scatter(dat[:, 0],
                    dat[:, 1],
                    label=target_df[target],
                    c=target_values)

    if colorbar:
        plt.colorbar(sc)

    ## Remove a bunch of visual noise
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(top='off', bottom='off', left='off', right='off')

    var_expl = pca.explained_variance_ratio_
    ax.set_xlabel("Variance explained {:.3}%".format(var_expl[0] * 100),
                  fontsize=15)
    ax.set_ylabel("Variance explained {:.3}%".format(var_expl[1] * 100),
                  fontsize=15)

    if title:
        ax.set_title(title)

    ## TODO: Doesn't work how I'd like.
    ##print("Explained variance", pca.explained_variance_ratio_)
    ##if loadings:
    ##    for i, comp in enumerate(pca.components_.T):
    ##        plt.arrow(0, 0, pca.components_.T[i,0], pca.components_.T[i,1], color = 'r',alpha = 0.5)
    ##        plt.text(pca.components_.T[i,0]* 1.5, pca.components_.T[i,1] * 1.5, dat[i+2], color = 'black', ha = 'center', va = 'center')

    ## If writing to file then don't plot to screen.
    if outfile:
        try:
            plt.savefig(outfile)
            if verbose: print("Wrote figure to: {}".format(outfile))
        except Exception as inst:
            raise Exception("Failed saving figure: {}".format(inst))
        plt.close()

    return ax
Code example #32
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split

print(__doc__)


N_SAMPLES = 1000
FONT_SIZE = 6
BINS = 30


rng = np.random.RandomState(304)
bc = PowerTransformer(method='box-cox')
yj = PowerTransformer(method='yeo-johnson')
qt = QuantileTransformer(output_distribution='normal', random_state=rng)
size = (N_SAMPLES, 1)


# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)