def normalize_cols(tr, val, train, test, cols):
    """Quantile-normalize `cols` in place for two fit/transform pairs.

    One transformer is fitted on `tr` and applied to `tr`/`val`; a second is
    fitted on `train` and applied to `train`/`test`. All four frames are
    mutated in place and the results stored as float32.
    """
    fold_scaler = QuantileTransformer(output_distribution="normal")
    tr[cols] = fold_scaler.fit_transform(tr[cols]).astype(np.float32)
    val[cols] = fold_scaler.transform(val[cols]).astype(np.float32)
    full_scaler = QuantileTransformer(output_distribution="normal")
    train[cols] = full_scaler.fit_transform(train[cols]).astype(np.float32)
    test[cols] = full_scaler.transform(test[cols]).astype(np.float32)
def quantile_scaler(train, validate, test):
    """Scale three dataframes to a uniform distribution via QuantileTransformer.

    Object-dtype columns are dropped first, since strings cannot be scaled.
    The transformer is fitted on `train` only and applied to all three frames.

    Returns:
        (scaler, train_scaled, validate_scaled, test_scaled)
    """
    # Drop non-numeric columns; strings cannot be quantile-transformed.
    train, validate, test = (
        df.select_dtypes(exclude=['object']) for df in (train, validate, test)
    )
    scaler = QuantileTransformer().fit(train)

    def _rebuild(df):
        # transform() returns a bare ndarray; restore labels and index.
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns.values).set_index([df.index.values])

    return scaler, _rebuild(train), _rebuild(validate), _rebuild(test)
def normalize(trn, val, test):
    """Quantile-normalize train/validation/test dataframes.

    The transformer (100 quantiles, normal output) is fitted on the train
    split only, then applied to all three splits.

    Args:
        trn: train dataframe.
        val: validation dataframe.
        test: test dataframe.

    Returns:
        Tuple of three normalized dataframes (train, validation, test).
    """
    norm_model = QuantileTransformer(n_quantiles=100, random_state=0,
                                     output_distribution="normal")
    norm_model.fit(trn)

    def _as_frame(df):
        # Preserve the index and column labels lost by the ndarray transform.
        return pd.DataFrame(norm_model.transform(df),
                            index=df.index, columns=df.columns)

    return _as_frame(trn), _as_frame(val), _as_frame(test)
def quantile_transformer(train_features, test_features, features, n_quantiles=100, output_distribution='normal'):
    """Apply a per-column QuantileTransformer (RankGauss) fitted on train data only.

    Args:
        train_features: train dataframe; a copy is transformed and returned.
        test_features: test dataframe; transformed with the train-fitted scaler.
        features: iterable of column names to transform.
        n_quantiles: number of quantiles for the transformer.
        output_distribution: target distribution ('normal' gives RankGauss).

    Returns:
        (train_features, test_features) copies with `features` transformed.
    """
    log = logging.getLogger(f"{__name__}.{inspect.currentframe().f_code.co_name}")
    log.info("Start.one_experiment")
    # Work on copies so the caller's frames are untouched.
    train_features = train_features.copy()
    test_features = test_features.copy()
    ##################################################
    # RankGauss - transform to Gauss
    ##################################################
    log.debug(f"Prearation data transform.\ntrain_features.shape: {train_features.shape}")
    for col in tqdm(features, 'QuantileTransformer', leave=False):
        # kurt = max(kurtosis(train_features[col]), kurtosis(test_features[col]))
        # QuantileTransformer_n_quantiles = n_quantile_for_kurt(kurt, calc_QT_par_kurt(QT_n_quantile_min, QT_n_quantile_max))
        # transformer = QuantileTransformer(n_quantiles=QuantileTransformer_n_quantiles,random_state=0, output_distribution="normal")
        transformer = QuantileTransformer(n_quantiles=n_quantiles, random_state=0,
                                          output_distribution=output_distribution)
        # from optimal commit 9
        vec_len = len(train_features[col].values)
        vec_len_test = len(test_features[col].values)
        # Fit on the train column reshaped to the (n_samples, 1) shape sklearn expects.
        raw_vec = train_features[col].values.reshape(vec_len, 1)
        transformer.fit(raw_vec)
        train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
        test_features[col] = \
            transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]
    # Free per-column intermediates; feature lists can be large.
    gc.collect()
    return train_features, test_features
def rankGauss(dfTrain, dfTest=None, n_quantiles=100, random_state=0):
    """RankGauss: quantile-transform single-column frames to a normal distribution.

    The transformer is fitted on `dfTrain` only; `dfTest` (if given) is
    transformed with the train-fitted statistics.

    Returns:
        (train_df, test_df) single-column dataframes keeping the original
        column names; the second element is None when `dfTest` was None.
    """
    # Define the transformer (translated from: transformer定義)
    transformer = QuantileTransformer(n_quantiles=n_quantiles, random_state=random_state,
                                      output_distribution="normal")
    # Row counts and column names (translated from: データ数, 列名)
    vec_len = len(dfTrain.values)
    clmnNmTrain = dfTrain.columns.values[0]
    if dfTest is not None:
        vec_len_test = len(dfTest.values)
        clmnNmTest = dfTest.columns.values[0]
    # Fitting on the train column reshaped to (n_samples, 1)
    raw_vec = dfTrain.values.reshape(vec_len, 1)
    transformer.fit(raw_vec)
    # Transform (translated from: 変換)
    dfTrain = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    if dfTest is not None:
        raw_vec_test = dfTest.values.reshape(vec_len_test, 1)
        dfTest = transformer.transform(raw_vec_test).reshape(1, vec_len_test)[0]
    if dfTest is not None:
        return pd.DataFrame(dfTrain, columns=[clmnNmTrain]), pd.DataFrame(dfTest, columns=[clmnNmTest])
    else:
        return pd.DataFrame(dfTrain, columns=[clmnNmTrain]), None
def rank_gauss(train_features, test_features):
    """Per-column RankGauss transform of g-*/c-* features, fitted on train only.

    Returns:
        (train_copy, test_copy) with the gene/cell columns quantile-normalized.
    """
    out_train = train_features.copy()
    out_test = test_features.copy()
    gene_cols = [c for c in out_train.columns if c.startswith('g-')]
    cell_cols = [c for c in out_train.columns if c.startswith('c-')]
    for col in gene_cols + cell_cols:
        scaler = QuantileTransformer(n_quantiles=100, random_state=0,
                                     output_distribution="normal")
        # sklearn expects a 2-D (n_samples, 1) column.
        train_col = out_train[col].values.reshape(-1, 1)
        scaler.fit(train_col)
        out_train[col] = scaler.transform(train_col).ravel()
        out_test[col] = scaler.transform(out_test[col].values.reshape(-1, 1)).ravel()
    return out_train, out_test
def uniform_scaler(train, validate, test):
    """Map each dataframe onto a uniform distribution via QuantileTransformer.

    Non-linear: this distorts correlations and inter-feature distances.
    Object-dtype columns are dropped, as strings cannot be scaled directly.
    The transformer is fitted on `train` only.

    Returns:
        (scaler, train_scaled, validate_scaled, test_scaled)
    """
    train, validate, test = (
        df.select_dtypes(exclude=['object']) for df in (train, validate, test)
    )
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform',
                                 random_state=123, copy=True).fit(train)

    def _rebuild(df):
        # Restore labels/index dropped by the ndarray transform.
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns.values).set_index([df.index.values])

    return scaler, _rebuild(train), _rebuild(validate), _rebuild(test)
def rankGauss(train_features, test_features, runty, test_features_p=None):
    """Per-column RankGauss transform of g-*/c-* features, fitted on train only.

    Args:
        train_features: train dataframe (columns mutated in place).
        test_features: test dataframe (columns mutated in place).
        runty: run-type flag; kept for interface compatibility (unused here).
        test_features_p: optional extra test dataframe, transformed when given.

    Returns:
        (train_features, test_features, test_features_p) — the last is None
        when no extra test frame was supplied.
    """
    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]
    for col in (GENES + CELLS):
        transformer = QuantileTransformer(n_quantiles=100, random_state=0,
                                          output_distribution='normal')
        vec_len = len(train_features[col].values)
        vec_len_test = len(test_features[col].values)
        raw_vec = train_features[col].values.reshape(vec_len, 1)
        transformer.fit(raw_vec)
        train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
        test_features[col] = transformer.transform(
            test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]
        # BUG FIX: the original dereferenced test_features_p unconditionally,
        # raising TypeError whenever the default None was used.
        if test_features_p is not None:
            vec_len_test_p = len(test_features_p[col].values)
            test_features_p[col] = transformer.transform(
                test_features_p[col].values.reshape(vec_len_test_p, 1)).reshape(1, vec_len_test_p)[0]
    return train_features, test_features, test_features_p
def scale(train, test, cols, scaler='standard'):
    """Fit the named scaler on train[cols] and replace those columns in both frames.

    Args:
        train, test: dataframes whose `cols` are replaced by scaled values.
        cols: column names to scale.
        scaler: one of 'uniform', 'robust', 'gaussian', 'minmax', 'standard'.

    Returns:
        (fitted_scaler, train, test); on an unknown scaler name, prints a
        warning and returns (None, train, test) unchanged.
    """
    scaler_factories = {
        'uniform': lambda: QuantileTransformer(output_distribution='uniform',
                                               random_state=123, copy=True),
        'robust': lambda: RobustScaler(quantile_range=(25.0, 75.0), copy=True,
                                       with_centering=True, with_scaling=True),
        'gaussian': lambda: PowerTransformer(method='yeo-johnson',
                                             standardize=False, copy=True),
        'minmax': lambda: MinMaxScaler(copy=True, feature_range=(0, 1)),
        'standard': lambda: StandardScaler(copy=True, with_mean=True, with_std=True),
    }
    factory = scaler_factories.get(scaler)
    if factory is None:
        print("WARNING: INVALID SCALER")
        return None, train, test
    fitted = factory().fit(train[cols])

    def _swap(df):
        # Replace the scaled columns while keeping the original index.
        scaled = pd.DataFrame(fitted.transform(df[cols]),
                              columns=cols).set_index([df.index.values])
        return df.drop(columns=cols).join(scaled)

    return fitted, _swap(train), _swap(test)
class LinearRegression:
    """Lasso/Ridge regressor over quantile-transformed inputs with a timed
    alpha search.

    NOTE(review): despite the name, this wraps sklearn Lasso/Ridge, not OLS.
    Assumes module-level names: Lasso, Ridge, KFold, GridSearchCV, Pipeline,
    make_scorer, TimeSeriesCV, _rmse, QuantileTransformer, np, time.
    """

    def __init__(self, random_seed=82):
        self.random_seed = random_seed
        # Derived seeds keep transformer, model, CV and refit deterministic
        # but decorrelated from each other.
        self.transformer_params = {'random_state': self.random_seed + 1}
        self.transformer = QuantileTransformer(**self.transformer_params)
        self.model_params = {'max_iter': 1000, 'random_state': self.random_seed + 2}
        self.model = None

    def train(self, data, label, ds=None, train_tl=200):
        """Fit a baseline Lasso, then (time budget permitting) grid-search
        Lasso/Ridge alphas and refit the winner.

        Args:
            data: feature dataframe; NaNs are filled in place with column means.
            label: target values.
            ds: optional datetime-like column enabling time-series CV splits.
            train_tl: overall training time budget in seconds.
        """
        start_time = time.time()
        self.fillna_values = data.mean()
        data.fillna(self.fillna_values, inplace=True)
        # Baseline model; its fit time also sizes the search budget below.
        self.model = Lasso(**self.model_params, alpha=0.1)
        self.model.fit(self.transformer.fit_transform(data), label)
        model_train_time = time.time() - start_time
        try:
            # search
            if ds is not None:
                data['ds'] = ds
                cv = TimeSeriesCV(n_splits=min(6, data.shape[0] // 30))
                folds = list(cv.split(data))
                data.drop('ds', axis=1, inplace=True)
            else:
                cv = KFold(n_splits=3, shuffle=True,
                           random_state=self.random_seed + 3)
                folds = list(cv.split(data))
            # Number of candidate alphas that fit in the remaining time budget.
            n_alphas = int(min(35, (train_tl - 2 * model_train_time)
                               / (model_train_time * len(folds))))
            lasso_alpha, lasso_rmse = self._search_params(
                data, label, model=Lasso,
                search_space=np.logspace(-2, 0, n_alphas), folds=folds)
            Model, best_alpha = Lasso, lasso_alpha
            n_alphas = int(min(10, (train_tl - (time.time() - start_time)
                                    - model_train_time)
                               / (model_train_time * 1.5 * len(folds))))
            if n_alphas > 2:
                ridge_alpha, ridge_rmse = self._search_params(
                    data, label, model=Ridge,
                    search_space=np.logspace(-2, 2, n_alphas), folds=folds)
                # Scores come from greater_is_better=False, i.e. negated RMSE,
                # so a larger score is better here.
                if lasso_rmse * 0.99 < ridge_rmse:
                    best_alpha = ridge_alpha
                    Model = Ridge
            self.model_params.update({'alpha': best_alpha,
                                      'random_state': self.random_seed + 4})
            self.model = Model(**self.model_params)
            self.model.fit(self.transformer.transform(data), label)
        except Exception:
            # BUG FIX: was a bare `except:` which also swallowed SystemExit and
            # KeyboardInterrupt. A failed search still leaves the baseline
            # Lasso fitted above as the active model.
            pass

    def predict(self, data):
        """Fill NaNs with the train means, transform, and predict."""
        data = self.transformer.transform(data.fillna(self.fillna_values))
        preds = self.model.predict(data)
        return preds

    def _search_params(self, data, label, model, search_space, folds=3,
                       scorer=None):
        """Grid-search 'm__alpha' over `search_space`.

        Returns:
            (best_alpha, best_score) from the fitted GridSearchCV.
        """
        scorer = scorer or make_scorer(_rmse, greater_is_better=False)
        pipeline = Pipeline([
            ('t', QuantileTransformer(**self.transformer_params)),
            ('m', model(**self.model_params))
        ])
        gs = GridSearchCV(pipeline, {'m__alpha': search_space},
                          scoring=scorer, cv=folds)
        gs.fit(data, label)
        return gs.best_params_['m__alpha'], gs.best_score_
def qnt_transform(train, test):
    """Quantile-transform the configured continuous features to a normal distribution.

    NOTE(review): the scaler is fitted on train and test pooled together, so
    test statistics leak into the fit — presumably intentional (common in
    competition pipelines); confirm before reusing elsewhere.

    Returns:
        (train_qnt, test_qnt): ndarrays of the transformed feature columns.
    """
    # Feature list comes from the module-level BUREAU_CONFIG.
    cont_feats = BUREAU_CONFIG["qnt_cols"]
    # Pool both splits so the quantile grid covers the full value range.
    data = pd.concat([train[cont_feats], test[cont_feats]])
    scaler = QuantileTransformer(output_distribution="normal", n_quantiles=2000)
    scaler.fit(data)
    train_qnt = scaler.transform(train[cont_feats])
    test_qnt = scaler.transform(test[cont_feats])
    return train_qnt, test_qnt
def quantile_scaler(X_train, X_test):
    """Normal-output quantile scaling fitted on X_train only.

    Returns:
        (scaled_X_train, scaled_X_test, scaler) with indexes/columns preserved.
    """
    scaler = QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                                 random_state=123, copy=True)
    scaler.fit(X_train)

    def _frame(df):
        # Restore labels/index dropped by the ndarray transform.
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns).set_index([df.index])

    return _frame(X_train), _frame(X_test), scaler
def quantile_transformer(dataset, quantiles):
    """Split `dataset` into train/test, quantile-scale both, and return X/Y.

    The scaler is fitted on the train split only; both splits are transformed,
    re-labelled, and concatenated back into one frame.

    NOTE(review): depends on names not defined in this function —
    `split_train_test`, `percent_train`, `colnames`, `predictors`, `target`.
    Presumably module-level helpers/globals; verify they exist at call time.

    Returns:
        (X, Y, scaler): predictor frame, target column(s), fitted scaler.
    """
    train_set, test_set = split_train_test(dataset, percent_train)
    scaler = QuantileTransformer(n_quantiles = quantiles)
    scaler.fit(train_set)
    scaled_train_set = pd.DataFrame(scaler.transform(train_set), columns = colnames)
    scaled_test_set = pd.DataFrame(scaler.transform(test_set), columns = colnames)
    # Recombine so X/Y selection spans both splits.
    scaled_df = pd.concat([scaled_train_set, scaled_test_set])
    X = scaled_df[predictors]
    Y = scaled_df[target]
    return X, Y, scaler
def normal_scaler(train, test, seed=123):
    """Fit a normal-output QuantileTransformer on `train` and scale both frames.

    Non-linear transformation mapping values onto a normal distribution.

    Returns:
        (scaler, train_scaled, test_scaled) with original indexes preserved.
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='normal',
                                 random_state=seed, copy=True)
    scaler.fit(train)

    def _rebuild(df):
        # Restore column labels and index dropped by the ndarray transform.
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns.values).set_index([df.index.values])

    return scaler, _rebuild(train), _rebuild(test)
def uniform_scaler(train, test):
    """Default QuantileTransformer (uniform output) fitted on train; scales both frames.

    Returns:
        (scaler, train_scaled, test_scaled) with original indexes preserved.
    """
    scaler = QuantileTransformer()
    scaler.fit(train)

    def _rebuild(df):
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns.values).set_index([df.index.values])

    return scaler, _rebuild(train), _rebuild(test)
def uniform_scaler(train, test, cols):
    """Uniform-quantile-scale `cols` (train-fitted) and splice them back into both frames.

    Returns:
        (scaler, train, test) where the named columns are replaced by their
        scaled values; all other columns are untouched.
    """
    scaler = QuantileTransformer(output_distribution='uniform',
                                 random_state=123, copy=True).fit(train[cols])

    def _replace(df):
        # Scale the selected columns, keep the original index, rejoin.
        scaled = pd.DataFrame(scaler.transform(df[cols]),
                              columns=cols).set_index([df.index.values])
        return df.drop(columns=cols).join(scaled)

    return scaler, _replace(train), _replace(test)
def uniform_scaler(train, test):
    """Quantile transformer, non-linear transformation - uniform.

    Reduces the impact of outliers, smooths out unusual distributions.
    Takes in a train and test set of data, creates and fits a scaler to the
    train set; returns the scaler, train_scaled, test_scaled.
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform', random_state=123, copy=True).fit(train)
    # transform() returns bare ndarrays; rebuild dataframes with the original
    # column labels and index for both splits.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
def scale_data(train, test, feats):
    """Quantile-transform `feats` to a normal distribution, fitted on train+test pooled.

    NOTE(review): fitting on the pooled frames leaks test statistics into the
    fit — presumably intentional for this pipeline; confirm before reuse.

    Args:
        train, test: dataframes containing the `feats` columns.
        feats: list of feature column names.

    Returns:
        (train_qnt, test_qnt): dataframes of the transformed features with
        columns renamed to '<feat>_qnt'.
    """
    # BUG FIX: `subsample` must be an int — recent scikit-learn versions
    # reject the float literal 5e5 during parameter validation.
    scaler = QuantileTransformer(output_distribution="normal", n_quantiles=2000,
                                 subsample=500000, random_state=12345786)
    df_all = pd.concat([train[feats], test[feats]], axis=0)
    scaler.fit(df_all)
    qnt_feats = [f + "_qnt" for f in feats]
    train_qnt = pd.DataFrame(scaler.transform(train[feats]), columns=qnt_feats)
    test_qnt = pd.DataFrame(scaler.transform(test[feats]), columns=qnt_feats)
    return train_qnt, test_qnt
def uniform_scaler(train, test, seed=123):
    """Uniform quantile scaling fitted on `train`; `seed` feeds random_state.

    Returns:
        (scaler, train_scaled, test_scaled) with original indexes preserved.
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform',
                                 random_state=seed, copy=True)
    scaler.fit(train)

    def _rebuild(df):
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns.values).set_index([df.index.values])

    return scaler, _rebuild(train), _rebuild(test)
def quantile_transform(X_train, X_valid, X_test, columns):
    """Quantile-transform selected columns in place, fitted on X_train only.

    Args:
        X_train: 2-D array; `columns` are overwritten with transformed values.
        X_valid: optional 2-D array transformed with the train-fitted scaler.
        X_test: optional 2-D array transformed with the train-fitted scaler.
        columns: column indices to transform.

    Returns:
        (X_train, X_valid, X_test) when X_valid is given, else X_train alone
        (matching the original interface).
    """
    t = QuantileTransformer()
    t.fit(X_train[:, columns])
    # BUG FIX: the original only wrote the transformed train columns back when
    # X_valid was provided, silently returning untransformed X_train otherwise.
    X_train[:, columns] = t.transform(X_train[:, columns])
    if X_valid is not None:
        X_valid[:, columns] = t.transform(X_valid[:, columns])
    if X_test is not None:
        X_test[:, columns] = t.transform(X_test[:, columns])
    if X_valid is not None:
        return X_train, X_valid, X_test
    return X_train
class _DistTransformer:
    """Fit/transform a single feature with a selectable distribution transform."""

    # Supported transform names.
    TRANSFORMS = {
        'standard', 'min-max', 'box-cox', 'yeo-johnson', 'rankgauss'
    }

    def __init__(self, transform='standard'):
        assert transform in self.TRANSFORMS
        self.t = transform

    def fit(self, X: pd.Series, y=None) -> None:
        """Build the underlying sklearn transformer and fit it on X reshaped to 2-D."""
        if self.t == 'standard':
            self.transformer = StandardScaler()
        elif self.t == 'min-max':
            self.transformer = MinMaxScaler()
        elif self.t == 'box-cox':
            self.transformer = PowerTransformer(method='box-cox')
        elif self.t == 'yeo-johnson':
            self.transformer = PowerTransformer(method='yeo-johnson')
        elif self.t == 'rankgauss':
            self.transformer = QuantileTransformer(
                n_quantiles=len(X), random_state=0,
                output_distribution='normal')
        else:
            # BUG FIX: the original raised ValueError(self.transform), which
            # names the bound method, not the chosen transform string stored
            # in self.t. (Unreachable while __init__ asserts, but it must not
            # mislead if TRANSFORMS grows.)
            raise ValueError(self.t)
        if isinstance(X, pd.Series):
            self.transformer.fit(X.values.reshape(-1, 1))
        elif isinstance(X, np.ndarray):
            self.transformer.fit(X.reshape(-1, 1))
        else:
            raise TypeError(type(X))

    def transform(self, X: pd.Series) -> np.ndarray:
        """Transform X (Series or ndarray) as a single-feature column."""
        if isinstance(X, pd.Series):
            return self.transformer.transform(X.values.reshape(-1, 1))
        elif isinstance(X, np.ndarray):
            return self.transformer.transform(X.reshape(-1, 1))
        else:
            raise TypeError(type(X))

    def fit_transform(self, X: pd.Series) -> np.ndarray:
        """Convenience: fit then transform."""
        self.fit(X)
        return self.transform(X)

    def copy(self):
        # Shallow copy; relies on `copy` imported at module level.
        return copy(self)
def uniform_scaler(x_train, x_test):
    """Uniform quantile-scale x_train/x_test, fitted on the training frame.

    BUG FIX: the original fitted the scaler on only
    x_train[['monthly_charges', 'tenure']] but then transformed the FULL
    frames, which raises a feature-count mismatch (or silently mis-scales)
    whenever x_train has any other columns. Fit and transform now use the
    same columns; behavior is identical in the only case that previously
    worked (a frame containing exactly those two columns).

    Returns:
        (u_train_x_scaled, u_test_x_scaled) dataframes with original indexes.
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform',
                                 random_state=123, copy=True).fit(x_train)

    def _rebuild(df):
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns.values).set_index([df.index.values])

    return _rebuild(x_train), _rebuild(x_test)
def scale(self, use_quantile_transformer=False):
    """Scale self.df_train/self.df_test columns in place, fitted on train only.

    Args:
        use_quantile_transformer: use QuantileTransformer(n_quantiles=10)
            instead of the default StandardScaler.
    """
    if self.verbose:
        # BUG FIX: the Python 2 `print` statement is a SyntaxError under
        # Python 3 (this file uses f-strings elsewhere, so it targets py3).
        print('Scaling features ...')
    if use_quantile_transformer:
        scaler = QuantileTransformer(n_quantiles=10, random_state=0)
    else:
        # scaler = RobustScaler()
        scaler = StandardScaler()
    columns = self.df_train.columns
    scaler.fit(self.df_train[columns])
    self.df_train[columns] = scaler.transform(self.df_train[columns])
    self.df_test[columns] = scaler.transform(self.df_test[columns])
def uniform_scaler(train_data, test_data):
    """Fit a uniform QuantileTransformer on train_data and scale both frames.

    Returns:
        (scaler, train_scaled, test_scaled) with indexes/columns preserved.
    """
    scaler = QuantileTransformer(n_quantiles=100, output_distribution='uniform',
                                 random_state=123, copy=True)
    scaler.fit(train_data)

    def _frame(df):
        return pd.DataFrame(scaler.transform(df),
                            columns=df.columns, index=df.index)

    return scaler, _frame(train_data), _frame(test_data)
def augment_quantiled(X_train, X_valid, X_test, columns):
    """Augment min-max-scaled features with quantile-transformed copies of `columns`.

    The QuantileTransformer is fitted on X_train only; `min_max_scale`
    (defined elsewhere in this module) produces the base features, and the
    quantiled columns are concatenated to them along axis 1.

    NOTE(review): when X_valid is None, the min-max-scaled X_train alone is
    returned and the computed qX_train/qX_test are discarded — confirm this
    asymmetry is intended.

    Returns:
        (X_train, X_valid, X_test) augmented arrays, or X_train alone when
        X_valid is None.
    """
    t = QuantileTransformer()
    t.fit(X_train[:, columns])
    qX_train = t.transform(X_train[:, columns])
    qX_valid = t.transform(X_valid[:, columns]) \
        if X_valid is not None else None
    qX_test = t.transform(X_test[:, columns]) if X_test is not None else None
    # Base features come from the sibling min_max_scale helper.
    mX_train, mX_valid, mX_test = min_max_scale(X_train, X_valid, X_test)
    X_train = np.concatenate((mX_train, qX_train), axis=1)
    if qX_valid is None:
        return X_train
    else:
        X_valid = np.concatenate((mX_valid, qX_valid), axis=1)
        X_test = np.concatenate((mX_test, qX_test), axis=1)
        return X_train, X_valid, X_test
def uniform_scaler(train_data, test_data):
    """Uniform quantile scaling fitted on the train frame.

    Returns:
        (scaled_train, scaled_test, uniform_scaler) — note the scaler is last.
    """
    # Create a uniform-output scaler and fit it on the train data only.
    uniform_scaler = QuantileTransformer(n_quantiles=100,
                                         output_distribution="uniform",
                                         random_state=123, copy=True)
    uniform_scaler.fit(train_data)

    def _rebuild(df):
        # Restore labels and index dropped by the ndarray transform.
        return pd.DataFrame(uniform_scaler.transform(df),
                            columns=df.columns).set_index([df.index])

    return _rebuild(train_data), _rebuild(test_data), uniform_scaler
class SampleWeight(object):
    """Compute per-row sample weights from distance to a single 'central' feature row.

    Rows closer to the central row (after quantile scaling) receive larger
    weights; rows later in the frame also get a mild logarithmic boost.
    """

    def __init__(self, central_feat, verbose=0):
        # central_feat must be a single-row reference dataframe.
        assert isinstance(central_feat, pd.DataFrame)
        assert central_feat.shape[0] == 1, central_feat.shape
        self.central_feat = central_feat
        self.verbose = verbose
        # Shared scaler so x and central_feat live in the same quantile space.
        self.sc = QuantileTransformer()

    def weight(self, x_sub, central_feat, cols):
        """Weight = log-position boost / (eps + summed normalized |distance| to center)."""
        if self.verbose:
            # Diagnostic plots only; `s` is computed but not used further.
            for c in cols:
                s = x_sub[c] - central_feat[c].values[0]
                s /= 1. + np.mean(x_sub[c])
                x_sub[c].plot()
                plt.axhline(central_feat[c].values[0])
                plt.title(c)
                plt.show()
        # Per-column absolute deviation from the central value, normalized by
        # 1 + column mean to keep columns comparable.
        diff = [
            abs(x_sub[c] - central_feat[c].values[0]) / (1. + np.mean(x_sub[c]))
            for c in cols
        ]
        n = x_sub.shape[0]
        # NOTE(review): np.log(2 + position) grows with row order — presumably
        # favors later (more recent?) rows; confirm the intended row ordering.
        weight = np.log(2 + np.array([i for i in range(n)])) / (len(cols) * .1 + sum(diff))
        return weight

    def scale_weight(self, x, fit=False, cols=None):
        """Quantile-scale x (and the stored central row), then compute weights.

        Args:
            x: dataframe with the same columns as central_feat.
            fit: when True, (re)fit the shared scaler on x[cols] first.
            cols: subset of columns to use; defaults to all columns.
        """
        assert np.all(x.columns == self.central_feat.columns)
        if cols is None:
            cols = x.columns
        x_sub = x[cols].copy()
        if fit:
            self.sc.fit(x_sub)
        x_sub_sc = self.sc.transform(x_sub)
        x_sub_sc = pd.DataFrame(x_sub_sc, columns=cols)
        # Project the central row through the same fitted scaler.
        central_sub = self.central_feat[cols].copy()
        central_feat_sc = self.sc.transform(central_sub)
        central_feat_sc = pd.DataFrame(central_feat_sc, columns=cols)
        weight = self.weight(x_sub_sc, central_feat_sc, cols)
        return weight
def rankGauss(train, test, col):
    """RankGauss-transform `col` in both frames (in place), fitted on train only.

    NOTE(review): `col` is presumably a list of column names so that `.values`
    yields the 2-D array sklearn expects — confirm against callers.
    """
    scaler = QuantileTransformer(n_quantiles=100, random_state=0,
                                 output_distribution="normal")
    scaler.fit(train[col].values)
    train[col] = scaler.transform(train[col].values)
    test[col] = scaler.transform(test[col].values)
    return train, test
def scale_filtration_quantile(graphs, attribute="f"):
    """
    Scale the filtration values of the graphs, so that they are uniformly
    distributed between 0 and 1.

    Parameters
    ----------
    graphs:
        A list of igraph graphs.
    attribute:
        Attribute where the value for the filtration is stored.

    Returns
    -------
    A new list of copied graphs with scaled vertex values; each edge gets the
    max of its endpoints' scaled values.
    """
    # Pool every vertex value across all graphs so the quantile grid is global.
    values = []
    for graph in graphs:
        values += graph.vs[attribute]
    values = np.reshape(values, (-1, 1))
    scaler = QuantileTransformer()
    scaler.fit(values)
    scaled_graphs = []
    for graph in graphs:
        # Work on a copy; the input graphs are left untouched.
        graph = ig.Graph.copy(graph)
        node_values = graph.vs[attribute]
        node_values = np.reshape(node_values, (-1, 1))
        scaled_node_values = scaler.transform(node_values)
        graph.vs[attribute] = list(
            np.reshape(scaled_node_values, (scaled_node_values.shape[0])))
        # Edge filtration value = max of its two (scaled) endpoint values,
        # the usual sublevel-set convention.
        edge_weights = []
        for edge in graph.es:
            a = graph.vs[edge.source][attribute]
            b = graph.vs[edge.target][attribute]
            edge_weights.append(max(a, b))
        graph.es[attribute] = edge_weights
        scaled_graphs.append(graph)
    return scaled_graphs
class QuantileTransformerImpl():
    """Thin wrapper delegating to the wrapped sklearn transformer (SKLModel)."""

    def __init__(self, n_quantiles=1000, output_distribution='uniform',
                 ignore_implicit_zeros=False, subsample=100000,
                 random_state=None, copy=True):
        # Capture the constructor arguments and forward them unchanged.
        self._hyperparams = {
            'n_quantiles': n_quantiles,
            'output_distribution': output_distribution,
            'ignore_implicit_zeros': ignore_implicit_zeros,
            'subsample': subsample,
            'random_state': random_state,
            'copy': copy,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped transformer; y is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate transformation to the wrapped model."""
        return self._wrapped_model.transform(X)