Example #1
def normalize_cols(tr, val, train, test, cols):
    qnt = QuantileTransformer(output_distribution="normal")
    tr[cols] = qnt.fit_transform(tr[cols]).astype(np.float32)
    val[cols] = qnt.transform(val[cols]).astype(np.float32)

    train[cols] = qnt.fit_transform(train[cols]).astype(np.float32)
    test[cols] = qnt.transform(test[cols]).astype(np.float32)
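A minimal usage sketch for the helper above, using hypothetical exponentially distributed frames; note that the function mutates all four DataFrames in place and refits the transformer on the full training frame before scaling the test set.

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(0)
cols = ["f1", "f2"]
tr = pd.DataFrame(rng.exponential(size=(1000, 2)), columns=cols)
val = pd.DataFrame(rng.exponential(size=(300, 2)), columns=cols)
train = pd.concat([tr, val], ignore_index=True)   # full training data
test = pd.DataFrame(rng.exponential(size=(400, 2)), columns=cols)

# Fits on `tr` to scale the validation fold, then refits on `train` to scale `test`.
normalize_cols(tr, val, train, test, cols)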
Example #2
def quantile_scaler(train, validate, test):
    '''
    Accepts three dataframes and applies QuantileTransformer() to convert the values in each
    dataframe to a uniform distribution.
    Columns containing object data types are dropped, as strings cannot be directly scaled.

    Parameters (train, validate, test) = three dataframes being scaled
    
    Returns (scaler, train_scaled, validate_scaled, test_scaled)
    '''
    # Remove columns with object data types from each dataframe
    train = train.select_dtypes(exclude=['object'])
    validate = validate.select_dtypes(exclude=['object'])
    test = test.select_dtypes(exclude=['object'])
    # Fit the scaler to the train dataframe
    scaler = QuantileTransformer().fit(train)
    # Apply the fitted scaler to the train, validate, and test dataframes
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    validate_scaled = pd.DataFrame(scaler.transform(validate),
                                   columns=validate.columns.values).set_index(
                                       [validate.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])
    return scaler, train_scaled, validate_scaled, test_scaled
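A hedged usage sketch with made-up data; the column names below are illustrative only. The object-typed `city` column is dropped by the helper before scaling, and the uniform output lands in [0, 1].

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(42)

def make_df(n):
    return pd.DataFrame({
        "sqft": rng.normal(1500, 300, n),
        "price": rng.lognormal(12, 0.5, n),
        "city": rng.choice(["SA", "ATX"], n),  # object dtype, excluded from scaling
    })

train, validate, test = make_df(1200), make_df(300), make_df(300)
scaler, train_s, validate_s, test_s = quantile_scaler(train, validate, test)
print(train_s.min().min(), train_s.max().max())  # roughly 0.0 and 1.0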
Example #3
def normalize(trn, val, test):
    """
    Performs quantile normalization on the train, validation and test data. The QuantileTransformer
    is fitted on the train data and then applied to the validation and test data.
    
    Args:
            trn: train data - pandas dataframe.
            val: validation data - pandas dataframe.
            test: test data - pandas dataframe.
    
    Returns:
            trn_norm: normalized train data - pandas dataframe.
            val_norm: normalized validation data - pandas dataframe.
            tst_norm: normalized test data - pandas dataframe.
    """
    norm_model = QuantileTransformer(n_quantiles=100,
                                     random_state=0,
                                     output_distribution="normal")
    trn_norm = pd.DataFrame(norm_model.fit_transform(trn),
                            index=trn.index,
                            columns=trn.columns)
    val_norm = pd.DataFrame(norm_model.transform(val),
                            index=val.index,
                            columns=val.columns)
    tst_norm = pd.DataFrame(norm_model.transform(test),
                            index=test.index,
                            columns=test.columns)
    return trn_norm, val_norm, tst_norm
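A quick sanity check with hypothetical lognormal frames: the transformer is fitted on `trn` only, so the validation and test frames are mapped using the training quantiles.

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(1)
trn = pd.DataFrame(rng.lognormal(size=(500, 3)), columns=list("abc"))
val = pd.DataFrame(rng.lognormal(size=(100, 3)), columns=list("abc"))
tst = pd.DataFrame(rng.lognormal(size=(100, 3)), columns=list("abc"))

trn_n, val_n, tst_n = normalize(trn, val, tst)
print(trn_n.mean().round(2), trn_n.std().round(2))  # close to 0 and 1 per column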
Example #4
def quantile_transformer(train_features, test_features, features, n_quantiles=100, output_distribution='normal'):
    log = logging.getLogger(f"{__name__}.{inspect.currentframe().f_code.co_name}")
    log.info("Start.one_experiment")
    train_features = train_features.copy()
    test_features = test_features.copy()

    ##################################################
    # RankGauss - transform to Gauss
    ##################################################

    log.debug(f"Prearation data transform.\ntrain_features.shape: {train_features.shape}")
    for col in tqdm(features, 'QuantileTransformer', leave=False):
        # kurt = max(kurtosis(train_features[col]), kurtosis(test_features[col]))
        # QuantileTransformer_n_quantiles = n_quantile_for_kurt(kurt, calc_QT_par_kurt(QT_n_quantile_min, QT_n_quantile_max))
        # transformer = QuantileTransformer(n_quantiles=QuantileTransformer_n_quantiles,random_state=0, output_distribution="normal")

        transformer = QuantileTransformer(n_quantiles=n_quantiles, random_state=0,
                                          output_distribution=output_distribution)  # from optimal commit 9
        vec_len = len(train_features[col].values)
        vec_len_test = len(test_features[col].values)
        raw_vec = train_features[col].values.reshape(vec_len, 1)
        transformer.fit(raw_vec)

        train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
        test_features[col] = \
            transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]

    gc.collect()
    return train_features, test_features
Example #5
def rankGauss(dfTrain, dfTest=None, n_quantiles=100, random_state=0):
    # define the transformer
    transformer = QuantileTransformer(n_quantiles=n_quantiles,
                                      random_state=random_state,
                                      output_distribution="normal")

    # number of rows and column name
    vec_len = len(dfTrain.values)
    clmnNmTrain = dfTrain.columns.values[0]
    if dfTest is not None:
        vec_len_test = len(dfTest.values)
        clmnNmTest = dfTest.columns.values[0]

    #fitting
    raw_vec = dfTrain.values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    # transform
    dfTrain = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    if dfTest is not None:
        raw_vec_test = dfTest.values.reshape(vec_len_test, 1)
        dfTest = transformer.transform(raw_vec_test).reshape(1,
                                                             vec_len_test)[0]

    if dfTest is not None:
        return (pd.DataFrame(dfTrain, columns=[clmnNmTrain]),
                pd.DataFrame(dfTest, columns=[clmnNmTest]))
    else:
        return pd.DataFrame(dfTrain, columns=[clmnNmTrain]), None
Example #6
def rank_gauss(train_features, test_features):

    train_features_ = train_features.copy()
    test_features_ = test_features.copy()

    GENES = [col for col in train_features_.columns if col.startswith('g-')]
    CELLS = [col for col in train_features_.columns if col.startswith('c-')]

    for col in (GENES + CELLS):

        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=0,
                                          output_distribution="normal")
        vec_len = len(train_features_[col].values)
        vec_len_test = len(test_features_[col].values)
        raw_vec = train_features_[col].values.reshape(vec_len, 1)
        transformer.fit(raw_vec)

        train_features_[col] = transformer.transform(raw_vec).reshape(
            1, vec_len)[0]
        test_features_[col] = transformer.transform(
            test_features_[col].values.reshape(vec_len_test,
                                               1)).reshape(1, vec_len_test)[0]

    return train_features_, test_features_
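An illustrative call with tiny synthetic frames; the g-*/c-* column names below are placeholders mirroring the prefixes the function expects, and columns that do not start with 'g-' or 'c-' pass through untouched.

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(0)
train_features = pd.DataFrame({
    "g-0": rng.normal(size=500), "g-1": rng.gamma(2.0, size=500),
    "c-0": rng.normal(size=500), "sig_id": np.arange(500),
})
test_features = pd.DataFrame({
    "g-0": rng.normal(size=200), "g-1": rng.gamma(2.0, size=200),
    "c-0": rng.normal(size=200), "sig_id": np.arange(200),
})

train_rg, test_rg = rank_gauss(train_features, test_features)
# `sig_id` is returned unchanged; only the g-*/c-* columns are rank-gaussed.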
Example #7
def uniform_scaler(train, validate, test):
    '''
    Accepts three dataframes and applies a non-linear transformer to convert the values in each
    dataframe to a uniform distribution. This will distort correlations and distances within and across features.
    Columns containing object data types are dropped, as strings cannot be directly scaled.

    Parameters (train, validate, test) = three dataframes being scaled
    
    Returns (scaler, train_scaled, validate_scaled, test_scaled)
    '''
    train = train.select_dtypes(exclude=['object'])
    validate = validate.select_dtypes(exclude=['object'])
    test = test.select_dtypes(exclude=['object'])
    scaler = QuantileTransformer(n_quantiles=100,
                                 output_distribution='uniform',
                                 random_state=123,
                                 copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    validate_scaled = pd.DataFrame(scaler.transform(validate),
                                   columns=validate.columns.values).set_index(
                                       [validate.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])
    return scaler, train_scaled, validate_scaled, test_scaled
Example #8
def rankGauss(train_features, test_features, runty, test_features_p=None):

    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]

    for col in (GENES + CELLS):

        transformer = QuantileTransformer(n_quantiles=100,
                                          random_state=0,
                                          output_distribution='normal')
        vec_len = len(train_features[col].values)
        vec_len_test = len(test_features[col].values)
        raw_vec = train_features[col].values.reshape(vec_len, 1)
        transformer.fit(raw_vec)

        train_features[col] = transformer.transform(raw_vec).reshape(
            1, vec_len)[0]
        test_features[col] = transformer.transform(
            test_features[col].values.reshape(vec_len_test,
                                              1)).reshape(1, vec_len_test)[0]

        if test_features_p is not None:
            vec_len_test_p = len(test_features_p[col].values)
            test_features_p[col] = transformer.transform(
                test_features_p[col].values.reshape(vec_len_test_p, 1)
            ).reshape(1, vec_len_test_p)[0]

    return train_features, test_features, test_features_p
Example #9
def scale(train, test, cols, scaler='standard'):
    if scaler == 'uniform':
        scaler = QuantileTransformer(output_distribution='uniform',
                                     random_state=123,
                                     copy=True).fit(train[cols])
    elif scaler == 'robust':
        scaler = RobustScaler(quantile_range=(25.0, 75.0),
                              copy=True,
                              with_centering=True,
                              with_scaling=True).fit(train[cols])
    elif scaler == 'gaussian':
        scaler = PowerTransformer(method='yeo-johnson',
                                  standardize=False,
                                  copy=True).fit(train[cols])
    elif scaler == 'minmax':
        scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(train[cols])
    elif scaler == 'standard':
        scaler = StandardScaler(copy=True, with_mean=True,
                                with_std=True).fit(train[cols])
    else:
        print("WARNING: INVALID SCALER")
        return None, train, test

    train_scaled = pd.DataFrame(scaler.transform(train[cols]),
                                columns=cols).set_index([train.index.values])
    train = train.drop(columns=cols)
    train = train.join(train_scaled)

    test_scaled = pd.DataFrame(scaler.transform(test[cols]),
                               columns=cols).set_index([test.index.values])
    test = test.drop(columns=cols)
    test = test.join(test_scaled)
    return scaler, train, test
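A usage sketch for the dispatcher above, on a hypothetical frame; any of 'uniform', 'robust', 'gaussian', 'minmax' or 'standard' can be passed, and columns outside `cols` are left untouched and re-joined.

import numpy as np
import pandas as pd
from sklearn.preprocessing import (MinMaxScaler, PowerTransformer,
                                   QuantileTransformer, RobustScaler,
                                   StandardScaler)

rng = np.random.default_rng(7)

def make_df(n):
    return pd.DataFrame({"age": rng.integers(18, 80, n),
                         "income": rng.lognormal(10, 1, n),
                         "label": rng.integers(0, 2, n)})

train, test = make_df(1200), make_df(400)
scaler, train_s, test_s = scale(train, test, cols=["age", "income"], scaler="uniform")
print(sorted(train_s.columns))  # ['age', 'income', 'label']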
Example #10
class LinearRegression:
    def __init__(self, random_seed=82):
        self.random_seed = random_seed
        self.transformer_params = {'random_state': self.random_seed + 1}
        self.transformer = QuantileTransformer(**self.transformer_params)
        self.model_params = {'max_iter': 1000, 'random_state': self.random_seed + 2}
        self.model = None

    def train(self, data, label, ds=None, train_tl=200):
        start_time = time.time()
        self.fillna_values = data.mean()
        data.fillna(self.fillna_values, inplace=True)
        self.model = Lasso(**self.model_params, alpha=0.1)
        self.model.fit(self.transformer.fit_transform(data), label)
        model_train_time = time.time() - start_time

        try:  # search
            if ds is not None:
                data['ds'] = ds
                cv = TimeSeriesCV(n_splits=min(6, data.shape[0] // 30))
                folds = list(cv.split(data))
                data.drop('ds', axis=1, inplace=True)
            else:
                cv = KFold(n_splits=3, shuffle=True, random_state=self.random_seed + 3)
                folds = list(cv.split(data))

            n_alphas = int(min(35, (train_tl - 2 * model_train_time) / (model_train_time * len(folds))))
            lasso_alpha, lasso_rmse = self._search_params(data, label, model=Lasso, search_space=np.logspace(-2, 0, n_alphas), folds=folds)
            Model, best_alpha = Lasso, lasso_alpha

            n_alphas = int(min(10, (train_tl - (time.time() - start_time) - model_train_time) / (model_train_time * 1.5 * len(folds))))
            if n_alphas > 2:
                ridge_alpha, ridge_rmse = self._search_params(data, label, model=Ridge, search_space=np.logspace(-2, 2, n_alphas), folds=folds)
                if lasso_rmse * 0.99 < ridge_rmse:
                    best_alpha = ridge_alpha
                    Model = Ridge

            self.model_params.update({'alpha': best_alpha, 'random_state': self.random_seed + 4})
            self.model = Model(**self.model_params)
            self.model.fit(self.transformer.transform(data), label)
        except Exception:
            # If the hyper-parameter search fails, keep the initial Lasso fit from above.
            pass

    def predict(self, data):
        data = self.transformer.transform(data.fillna(self.fillna_values))
        preds = self.model.predict(data)

        return preds

    def _search_params(self, data, label, model, search_space, folds=3, scorer=None):
        scorer = scorer or make_scorer(_rmse, greater_is_better=False)
        pipeline = Pipeline([
            ('t', QuantileTransformer(**self.transformer_params)),
            ('m', model(**self.model_params))
        ])
        gs = GridSearchCV(pipeline, {'m__alpha': search_space}, scoring=scorer, cv=folds)
        gs.fit(data, label)

        return gs.best_params_['m__alpha'], gs.best_score_
Example #11
def qnt_transform(train, test):
    cont_feats = BUREAU_CONFIG["qnt_cols"]
    data = pd.concat([train[cont_feats], test[cont_feats]])
    scaler = QuantileTransformer(output_distribution="normal",
                                 n_quantiles=2000)
    scaler.fit(data)
    train_qnt = scaler.transform(train[cont_feats])
    test_qnt = scaler.transform(test[cont_feats])
    return train_qnt, test_qnt
Example #12
def quantile_scaler(X_train, X_test):
    scaler = QuantileTransformer(n_quantiles=1000,output_distribution='normal',random_state=123,copy=True).fit(X_train)

    scaled_X_train = scaler.transform(X_train)
    scaled_X_train = pd.DataFrame(scaled_X_train, columns=X_train.columns).set_index([X_train.index])
    
    scaled_X_test = scaler.transform(X_test)
    scaled_X_test = pd.DataFrame(scaled_X_test, columns=X_test.columns).set_index([X_test.index])
    return scaled_X_train, scaled_X_test, scaler
Example #13
def quantile_transformer(dataset, quantiles):
    train_set, test_set = split_train_test(dataset, percent_train)
    scaler = QuantileTransformer(n_quantiles = quantiles)
    scaler.fit(train_set)
    scaled_train_set = pd.DataFrame(scaler.transform(train_set), columns = colnames)
    scaled_test_set = pd.DataFrame(scaler.transform(test_set), columns = colnames)
    scaled_df = pd.concat([scaled_train_set, scaled_test_set])
    X = scaled_df[predictors]
    Y = scaled_df[target]
    return X, Y, scaler
Example #14
def normal_scaler(train, test, seed=123):
    """Quantile transformer, non_linear transformation - normal
       Takes in a train and test set of data,
       creates and fits a scaler to the train set,
       returns the scaler, train_scaled, test_scaled
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=seed, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values]) 
    test_scaled= pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
Example #15
def uniform_scaler(train, test):
    scaler = QuantileTransformer()
    scaler.fit(train)
    train = pd.DataFrame(scaler.transform(train),
                         columns=train.columns.values).set_index(
                             [train.index.values])
    test = pd.DataFrame(scaler.transform(test),
                        columns=test.columns.values).set_index(
                            [test.index.values])
    return scaler, train, test
Example #16
def uniform_scaler(train, test, cols):
    scaler = QuantileTransformer(output_distribution='uniform', random_state=123, copy=True).fit(train[cols])

    train_scaled = pd.DataFrame(scaler.transform(train[cols]), columns=cols).set_index([train.index.values])
    train = train.drop(columns=cols)
    train = train.join(train_scaled)

    test_scaled = pd.DataFrame(scaler.transform(test[cols]), columns=cols).set_index([test.index.values])
    test = test.drop(columns=cols)
    test = test.join(test_scaled)
    return scaler, train, test
Example #17
def uniform_scaler(train, test):
    """Quantile transformer, non_linear transformation - uniform.
       Reduces the impact of outliers, smooths out unusual distributions.
       Takes in a train and test set of data,
       creates and fits a scaler to the train set,
       returns the scaler, train_scaled, test_scalexsd
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform', random_state=123, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values]) 
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
Example #18
def scale_data(train, test, feats):
    scaler = QuantileTransformer(output_distribution="normal",
                                 n_quantiles=2000,
                                 subsample=500_000,
                                 random_state=12345786)
    df_all = pd.concat([train[feats], test[feats]], axis=0)
    scaler.fit(df_all)
    qnt_feats = [f + "_qnt" for f in feats]
    train_qnt = pd.DataFrame(scaler.transform(train[feats]), columns=qnt_feats)
    test_qnt = pd.DataFrame(scaler.transform(test[feats]), columns=qnt_feats)
    return train_qnt, test_qnt
Example #19
def uniform_scaler(train, test, seed=123):
    scaler = QuantileTransformer(n_quantiles=100,
                                 output_distribution='uniform',
                                 random_state=seed,
                                 copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index(
                                    [train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index(
                                   [test.index.values])
    return scaler, train_scaled, test_scaled
Example #20
def quantile_transform(X_train, X_valid, X_test, columns):
    t = QuantileTransformer()
    t.fit(X_train[:, columns])
    qX_train = t.transform(X_train[:, columns])
    qX_valid = t.transform(X_valid[:, columns]) \
        if X_valid is not None else None
    qX_test = t.transform(X_test[:, columns]) if X_test is not None else None
    X_train[:, columns] = qX_train
    if X_valid is not None:
        X_valid[:, columns] = qX_valid
        if X_test is not None:
            X_test[:, columns] = qX_test
        return X_train, X_valid, X_test
    else:
        return X_train
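A sketch with random NumPy arrays (all names hypothetical); `columns` is a list of column indices transformed in place with quantiles learned from the training rows.

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(3)
X_train = rng.exponential(size=(1000, 5))
X_valid = rng.exponential(size=(200, 5))
X_test = rng.exponential(size=(200, 5))

X_train, X_valid, X_test = quantile_transform(X_train, X_valid, X_test, columns=[0, 2, 4])
print(X_train[:, 0].min(), X_train[:, 0].max())  # transformed columns lie in [0, 1]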
Example #21
class _DistTransformer:

    TRANSFORMS = {
        'standard', 'min-max',
        'box-cox', 'yeo-johnson',
        'rankgauss'
    }

    def __init__(self, transform='standard'):
        assert transform in self.TRANSFORMS
        self.t = transform

    def fit(self, X: pd.Series, y=None) -> None:
        if self.t == 'standard':
            self.transformer = StandardScaler()
        elif self.t == 'min-max':
            self.transformer = MinMaxScaler()
        elif self.t == 'box-cox':
            self.transformer = PowerTransformer(method='box-cox')
        elif self.t == 'yeo-johnson':
            self.transformer = PowerTransformer(method='yeo-johnson')
        elif self.t == 'rankgauss':
            self.transformer = QuantileTransformer(
                n_quantiles=len(X), random_state=0,
                output_distribution='normal')
        else:
            raise ValueError(self.t)

        if isinstance(X, pd.Series):
            self.transformer.fit(X.values.reshape(-1, 1))
        elif isinstance(X, np.ndarray):
            self.transformer.fit(X.reshape(-1, 1))
        else:
            raise TypeError(type(X))

    def transform(self, X: pd.Series) -> np.ndarray:
        if isinstance(X, pd.Series):
            return self.transformer.transform(X.values.reshape(-1, 1))
        elif isinstance(X, np.ndarray):
            return self.transformer.transform(X.reshape(-1, 1))
        else:
            raise TypeError(type(X))
    
    def fit_transform(self, X: pd.Series) -> np.ndarray:
        self.fit(X)
        return self.transform(X)

    def copy(self):
        return copy(self)
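A minimal sketch of the 'rankgauss' branch on a pandas Series; to run, the class above also needs `from copy import copy` and the sklearn scaler imports shown here.

import numpy as np
import pandas as pd
from copy import copy
from sklearn.preprocessing import (MinMaxScaler, PowerTransformer,
                                   QuantileTransformer, StandardScaler)

rng = np.random.default_rng(5)
s = pd.Series(rng.gamma(2.0, size=1000), name="skewed_feature")

dt = _DistTransformer(transform='rankgauss')
x = dt.fit_transform(s)  # ndarray of shape (1000, 1), roughly standard normal
print(round(x.mean(), 2), round(x.std(), 2))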
Example #22
def uniform_scaler(x_train, x_test):
    u_scaler_x_train = QuantileTransformer(
        n_quantiles=100,
        output_distribution='uniform',
        random_state=123,
        copy=True).fit(x_train[['monthly_charges', 'tenure']])

    u_train_x_scaled = pd.DataFrame(u_scaler_x_train.transform(x_train),
                                    columns=x_train.columns.values).set_index(
                                        [x_train.index.values])
    u_test_x_scaled = pd.DataFrame(u_scaler_x_train.transform(x_test),
                                   columns=x_test.columns.values).set_index(
                                       [x_test.index.values])

    return u_train_x_scaled, u_test_x_scaled
Example #23
    def scale(self, use_quantile_transformer=False):
        if self.verbose:
            print('Scaling features ...')

        if use_quantile_transformer:
            scaler = QuantileTransformer(n_quantiles=10, random_state=0)
        else:
            #scaler = RobustScaler()
            scaler = StandardScaler()

        columns = self.df_train.columns
        scaler.fit(self.df_train[columns])

        self.df_train[columns] = scaler.transform(self.df_train[columns])
        self.df_test[columns] = scaler.transform(self.df_test[columns])
Example #24
def uniform_scaler(train_data, test_data):
    '''
    Takes in train and test data, fits the scaler to the train data only,
    and returns the scaler plus scaled copies of both dataframes.
    '''
    scaler = QuantileTransformer(n_quantiles=100,
                                 output_distribution='uniform',
                                 random_state=123,
                                 copy=True).fit(train_data)
    test_scaled = pd.DataFrame(scaler.transform(test_data),
                               columns=test_data.columns,
                               index=test_data.index)
    train_scaled = pd.DataFrame(scaler.transform(train_data),
                                columns=train_data.columns,
                                index=train_data.index)
    return scaler, train_scaled, test_scaled
Example #25
def augment_quantiled(X_train, X_valid, X_test, columns):
    t = QuantileTransformer()
    t.fit(X_train[:, columns])
    qX_train = t.transform(X_train[:, columns])
    qX_valid = t.transform(X_valid[:, columns]) \
        if X_valid is not None else None
    qX_test = t.transform(X_test[:, columns]) if X_test is not None else None
    mX_train, mX_valid, mX_test = min_max_scale(X_train, X_valid, X_test)
    X_train = np.concatenate((mX_train, qX_train), axis=1)
    if qX_valid is None:
        return X_train
    else:
        X_valid = np.concatenate((mX_valid, qX_valid), axis=1)
        X_test = np.concatenate((mX_test, qX_test), axis=1)
        return X_train, X_valid, X_test
Example #26
def uniform_scaler(train_data, test_data):
    # Creates a Uniform Scaler object and fit Train Data
    uniform_scaler = QuantileTransformer(n_quantiles=100,
                                         output_distribution="uniform",
                                         random_state=123,
                                         copy=True).fit(train_data)
    # Scale Train Data and Convert to a Data Frame
    scaled_train = uniform_scaler.transform(train_data)
    scaled_train = pd.DataFrame(
        scaled_train, columns=train_data.columns).set_index([train_data.index])
    # Scale Test Data and Convert to a Data Frame
    scaled_test = uniform_scaler.transform(test_data)
    scaled_test = pd.DataFrame(
        scaled_test, columns=test_data.columns).set_index([test_data.index])
    return scaled_train, scaled_test, uniform_scaler
Example #27
class SampleWeight(object):
    def __init__(self, central_feat, verbose=0):
        assert isinstance(central_feat, pd.DataFrame)
        assert central_feat.shape[0] == 1, central_feat.shape
        self.central_feat = central_feat
        self.verbose = verbose
        self.sc = QuantileTransformer()

    def weight(self, x_sub, central_feat, cols):
        if self.verbose:
            for c in cols:
                s = x_sub[c] - central_feat[c].values[0]
                s /= 1. + np.mean(x_sub[c])
                x_sub[c].plot()
                plt.axhline(central_feat[c].values[0])
                plt.title(c)
                plt.show()

        diff = [
            abs(x_sub[c] - central_feat[c].values[0]) /
            (1. + np.mean(x_sub[c])) for c in cols
        ]

        n = x_sub.shape[0]
        weight = np.log(2 + np.array([i
                                      for i in range(n)])) / (len(cols) * .1 +
                                                              sum(diff))

        return weight

    def scale_weight(self, x, fit=False, cols=None):
        assert np.all(x.columns == self.central_feat.columns)
        if cols is None:
            cols = x.columns
        x_sub = x[cols].copy()

        if fit:
            self.sc.fit(x_sub)

        x_sub_sc = self.sc.transform(x_sub)
        x_sub_sc = pd.DataFrame(x_sub_sc, columns=cols)

        central_sub = self.central_feat[cols].copy()
        central_feat_sc = self.sc.transform(central_sub)
        central_feat_sc = pd.DataFrame(central_feat_sc, columns=cols)

        weight = self.weight(x_sub_sc, central_feat_sc, cols)
        return weight
Example #28
def rankGauss(train, test, col):
    transformer = QuantileTransformer(n_quantiles=100,
                                      random_state=0,
                                      output_distribution="normal")
    train[col] = transformer.fit_transform(train[col].values)
    test[col] = transformer.transform(test[col].values)
    return train, test
Example #29
def scale_filtration_quantile(graphs, attribute="f"):
    """
    Scale the filtration values of the graphs, so that they are uniformly distributed between 0 and 1.
    
    Parameters
    ----------
    graphs:
        A list of graphs
    attribute:
        Attribute where the value for the filtration is stored
    """
    values = []
    for graph in graphs:
        values += graph.vs[attribute]
    values = np.reshape(values, (-1, 1))
    scaler = QuantileTransformer()
    scaler.fit(values)

    scaled_graphs = []
    for graph in graphs:
        graph = ig.Graph.copy(graph)
        node_values = graph.vs[attribute]
        node_values = np.reshape(node_values, (-1, 1))
        scaled_node_values = scaler.transform(node_values)
        graph.vs[attribute] = list(
            np.reshape(scaled_node_values, (scaled_node_values.shape[0])))

        edge_weights = []
        for edge in graph.es:
            a = graph.vs[edge.source][attribute]
            b = graph.vs[edge.target][attribute]
            edge_weights.append(max(a, b))
        graph.es[attribute] = edge_weights
        scaled_graphs.append(graph)
    return scaled_graphs
Example #30
class QuantileTransformerImpl():
    def __init__(self,
                 n_quantiles=1000,
                 output_distribution='uniform',
                 ignore_implicit_zeros=False,
                 subsample=100000,
                 random_state=None,
                 copy=True):
        self._hyperparams = {
            'n_quantiles': n_quantiles,
            'output_distribution': output_distribution,
            'ignore_implicit_zeros': ignore_implicit_zeros,
            'subsample': subsample,
            'random_state': random_state,
            'copy': copy
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
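A usage sketch for the wrapper above; it assumes `SKLModel` is an alias for sklearn.preprocessing.QuantileTransformer, which the parameter list suggests.

import numpy as np
from sklearn.preprocessing import QuantileTransformer as SKLModel

rng = np.random.default_rng(9)
X = rng.lognormal(size=(2000, 4))

qt = QuantileTransformerImpl(n_quantiles=500, output_distribution='normal')
X_t = qt.fit(X).transform(X)  # fit() returns self, so the calls can be chained
print(X_t.shape, round(X_t.mean(), 2))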