コード例 #1
0
def power_transformer(ss: pd.Series):
    """
    Run sklearn's PowerTransformer on a single series.

    Dedicated counterpart of from_sklearn() for PowerTransformer only; it
    exists so that the resulting function carries a readable name (that name
    is used later in some important plots).

    Returns a pair:
      * the transformed values as a pd.Series, rebuilt on the index and name
        handed back by process_ss();
      * from_sklearn(transformer, label), where the label is the title-cased
        method name without hyphens followed by the first fitted lambda
        rounded to two decimals, written as a LaTeX-style subscript.
    """
    pt = PowerTransformer()

    values, series_name, index = process_ss(ss)

    try:
        transformed = pt.transform(values)
    except NotFittedError:
        # The transformer has no fitted lambdas yet: fit and transform at once.
        print(f"! fitting parameters for PowerTransformer on variable {series_name}")
        transformed = pt.fit_transform(values)

    # Only the first column of the 2-D sklearn output is kept.
    result = pd.Series(transformed[:, 0], index=index, name=series_name)

    method_label = pt.get_params()['method'].title().replace("-", "")
    first_lambda = round(pt.lambdas_[0], 2)
    label = f"{method_label}_{{\\lambda = {first_lambda}}}"

    return result, from_sklearn(pt, label)
コード例 #2
0
ファイル: test.py プロジェクト: shogun-toolbox/applications
def power_transform():
    """
    Write power-transformed train/test CSV splits for every country.

    For each entry of ``COUNTRIES``:

    1. ``../data/cleaned/<country>.csv`` (relative to the current working
       directory) is loaded.
    2. ``process.add_polynomial_features`` (with argument 10) and
       ``process.hot_encode_weeks`` are applied to it.
    3. The rows are split 80/20 with a fixed random seed, sorted by ``date``,
       and the ``week`` and ``date`` columns are dropped.
    4. A ``PowerTransformer`` is fitted on the training features only and the
       same fitted transformer is applied to the test features.
    5. ``process.train_std_normal`` is run on the training features and
       ``process.apply_std_normal`` on the test features, sharing the same
       ``means`` / ``std_deviations`` dictionaries.
    6. ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are written to
       ``../data/test/<country>/`` (the directory is created when missing).

    The function takes no arguments and returns ``None``; its result is the
    set of CSV files it writes.
    """
    path = Path.cwd()
    cleaned_data_path = path.parent / 'data' / 'cleaned'

    df = {}

    for country in COUNTRIES:
        # read file
        file_path = cleaned_data_path / (country + '.csv')
        df[country] = pd.read_csv(file_path)

        process.add_polynomial_features(country, df, 10)

        # The numerical column list is captured before the week encoding, so
        # columns added by hot_encode_weeks are not part of it.
        numerical_features = df[country].select_dtypes(
            exclude=["object"]).columns.drop('incidence')

        process.hot_encode_weeks(country, df)

        train = df[country].sample(frac=0.8, random_state=200)
        test = df[country].drop(train.index)

        train = train.sort_values(by="date").drop(columns=['week', 'date'])
        test = test.sort_values(by="date").drop(columns=['week', 'date'])

        y_train = train[['incidence']]
        y_test = test[['incidence']]
        X_train = train.drop(columns=['incidence'])
        X_test = test.drop(columns=['incidence'])

        # Fit on the training split only and reuse the learned lambdas for the
        # test split; re-fitting on the test data would leak its distribution.
        # The transformer returns plain arrays, so the results are wrapped
        # back into DataFrames to keep column names and row labels.
        pt = PowerTransformer()
        X_train = pd.DataFrame(pt.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
        X_test = pd.DataFrame(pt.transform(X_test),
                              columns=X_test.columns,
                              index=X_test.index)

        means = {}
        std_deviations = {}
        process.train_std_normal(X_train, numerical_features, means,
                                 std_deviations)

        process.apply_std_normal(X_test, numerical_features, means,
                                 std_deviations)

        test_data_path = path.parent / 'data' / 'test' / country
        # to_csv does not create missing directories.
        test_data_path.mkdir(parents=True, exist_ok=True)

        X_train.to_csv(test_data_path / 'X_train.csv', index=False)
        y_train.to_csv(test_data_path / 'y_train.csv', index=False)
        X_test.to_csv(test_data_path / 'X_test.csv', index=False)
        y_test.to_csv(test_data_path / 'y_test.csv', index=False)