def normalize_cols(tr, val, train, test, cols):
    """Quantile-normalize ``cols`` in place for two (fit, apply) frame pairs.

    A normal-output QuantileTransformer is fitted on ``tr`` and applied to
    ``val``, then refitted on ``train`` and applied to ``test``. Transformed
    values are cast to float32 and written back into the frames.
    """
    for fit_frame, apply_frame in ((tr, val), (train, test)):
        scaler = QuantileTransformer(output_distribution="normal")
        fit_frame[cols] = scaler.fit_transform(fit_frame[cols]).astype(np.float32)
        apply_frame[cols] = scaler.transform(apply_frame[cols]).astype(np.float32)
def predict_er(X, E, window=0.21, step=10, q=5, use_box_cox=True):
    # Fit a robust linear model E ~ X, then summarize the relative residual
    # (E - E_pred) / E in a rolling window along the quantile-transformed E.
    # Returns (x, y): window centers mapped back to the original E scale and
    # the windowed statistic produced by `rolling_window_er`.
    qt = QuantileTransformer(n_quantiles=q, random_state=0)
    lr = HuberRegressor()
    lr.fit(X, E)
    E_pred = lr.predict(X)
    # Sort all arrays by the true value so the rolling window moves
    # monotonically along E.
    idx_sorted = np.argsort(E)
    X = X[idx_sorted]
    E_pred = E_pred[idx_sorted]
    E = E[idx_sorted]
    # Map E into [0, 1] via a quantile transform so the window covers a
    # uniform fraction of the data.
    # NOTE(review): with use_box_cox this applies np.log, i.e. Box-Cox with
    # lambda fixed at 0, not a fitted Box-Cox — confirm the naming is intended.
    if use_box_cox:
        E_quantile = qt.fit_transform(np.log(E).reshape(-1, 1)).reshape(-1)
    else:
        E_quantile = qt.fit_transform(E.reshape(-1, 1)).reshape(-1)
    # Re-predict on the sorted X so predictions stay aligned with sorted E.
    E_pred = lr.predict(X)
    x, y = rolling_window_er(E_quantile, (E - E_pred) / E, window=window, step=step)
    # Undo the quantile (and log) mapping so x is on the original E scale.
    if use_box_cox:
        x = np.exp(qt.inverse_transform(x.reshape(-1, 1)).reshape(-1))
    else:
        x = qt.inverse_transform(x.reshape(-1, 1)).reshape(-1)
    return x, y
def train():
    """Train a linear SVM on the bank dataset and print evaluation metrics.

    Reads data/trainingbin_.csv, quantile-scales the features, fits an SVC,
    then prints the confusion matrix, macro-averaged FPR/FNR/accuracy and a
    classification report.
    """
    bankdata = pd.read_csv('data/trainingbin_.csv')
    X = bankdata.drop('class_label', axis=1)
    y = bankdata['class_label']
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    scaler = QuantileTransformer(output_distribution='uniform')
    X_train = scaler.fit_transform(X_train)
    # Bug fix: the scaler must only be fitted on the training split;
    # refitting on the test split leaked test statistics into the scaling.
    X_test = scaler.transform(X_test)
    # Bug fix: SVC has no parameter `g` — the kernel-width parameter is
    # `gamma` (ignored by the linear kernel, kept for parameter parity).
    clf = svm.SVC(kernel='linear', C=512.0, gamma=0.0078125)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn import metrics
    # The confusion matrix was previously printed twice; once is enough.
    cnf_matrix = confusion_matrix(y_test, y_pred)
    print(cnf_matrix)
    # Derive per-class counts from the confusion matrix.
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)
    TPR = TP / (TP + FN)  # sensitivity / recall
    TNR = TN / (TN + FP)  # specificity
    PPV = TP / (TP + FP)  # precision
    NPV = TN / (TN + FN)  # negative predictive value
    FPR = FP / (FP + TN)  # fall-out
    FNR = FN / (TP + FN)  # miss rate
    FDR = FP / (TP + FP)  # false discovery rate
    ACC = (TP + TN) / (TP + FP + FN + TN)
    # Macro-average over the classes actually present rather than the
    # previously hard-coded 55.
    n_classes = cnf_matrix.shape[0]
    print("FPR:", sum(FPR) / n_classes)
    print("FNR:", sum(FNR) / n_classes)
    print("ACC:", 100 * (sum(ACC) / n_classes))
    print(classification_report(y_test, y_pred))
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
def test_clustering(n_runs=20, alpha=0.5):
    # Runs the density/attribute ensemble clustering pipeline `n_runs` times
    # and reports NMI against the module-level ground-truth labels `true`.
    # `alpha` weights the structural distance against the attribute distance.
    nmis_both = []
    nmis_attributes = []
    nmis_structure = []
    for i in range(n_runs):
        print("Run number {0}".format(i))
        # Produces ./matrix.csv as a side effect.
        # NOTE(review): the separator argument is the literal string "'\t'"
        # (quotes included) — confirm ensemble_density_huge expects that.
        ensemble_density_huge('file.csv', "'\t'")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t", header=None).values
        dist_dense = dist_dense[:, :-1]  # drop trailing column from the final delimiter
        sims_attributes = ensemble_attributes("file_attributes.csv", "\t")
        sim_attributes = pd.read_csv("./matrix_uet.csv", delimiter="\t", header=None).values
        sim_attributes = sim_attributes[:, :-1]
        dist_attributes = sim_to_dist(np.array(sim_attributes))
        # Convex combination of structural and attribute distances.
        dist = alpha * dist_dense + (1 - alpha) * dist_attributes
        dist = dist / 2
        model_kmeans = KMeans(n_clusters=len(set(true)))
        scaler = QuantileTransformer(n_quantiles=10)
        dist_scaled = scaler.fit_transform(dist)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        dist_attributes_scaled = scaler.fit_transform(dist_attributes)
        # 2-D embeddings of each (precomputed) distance matrix.
        results_dense = TSNE(
            metric="precomputed").fit_transform(dist_dense_scaled)
        results_dense_both = TSNE(
            metric="precomputed").fit_transform(dist_scaled)
        results_dense_attributes = TSNE(
            metric="precomputed").fit_transform(dist_attributes_scaled)
        # Cluster each embedding and score it against the ground truth.
        labels_dense_kmeans_both = model_kmeans.fit_predict(results_dense_both)
        labels_dense_kmeans_attributes = model_kmeans.fit_predict(
            results_dense_attributes)
        labels_dense_kmeans_structure = model_kmeans.fit_predict(results_dense)
        nmis_both.append(
            nmi(labels_dense_kmeans_both, true, average_method="arithmetic"))
        nmis_attributes.append(
            nmi(labels_dense_kmeans_attributes, true, average_method="arithmetic"))
        nmis_structure.append(
            nmi(labels_dense_kmeans_structure, true, average_method="arithmetic"))
    # Mean and standard deviation of the NMI over all runs.
    print("Structure : {0}, {1}".format(np.mean(nmis_structure), np.std(nmis_structure)))
    print("Attributes : {0}, {1}".format(np.mean(nmis_attributes), np.std(nmis_attributes)))
    print("Both : {0}, {1}".format(np.mean(nmis_both), np.std(nmis_both)))
    return (nmis_structure, nmis_attributes, nmis_both)
def _test_quantile_transformer(shape, n_quantiles):
    """Fit a QuantileTransformer on sorted random normal data and run the
    shared sklearn-model test helper on it with a dynamic batch dimension."""
    from sklearn.preprocessing import QuantileTransformer
    helper = SklearnTestHelper()
    rng = np.random.RandomState(0)
    samples = rng.normal(loc=0.5, scale=0.25, size=shape)
    samples = np.sort(samples, axis=0)
    qt = QuantileTransformer(n_quantiles=n_quantiles, random_state=0)
    qt.fit_transform(samples)
    dshape = (relay.Any(), len(samples[0]))
    _test_model_impl(helper, qt, dshape, samples.astype("float32"))
def loadCreditCardData(label):
    """Load and preprocess the credit-card fraud dataset.

    Reads the Kaggle creditcard.csv, quantile-scales Amount/Time, downsamples
    the negative class to 20000 rows, shuffles, imputes numeric NaNs with the
    column median and ordinally bins every numeric feature except ``label``
    into 13 uniform bins.

    Args:
        label: name of the binary target column (1 = positive class).

    Returns:
        The preprocessed, shuffled pandas DataFrame.
    """
    dataframe = pd.read_csv("Data/creditcardfraud/creditcard.csv",
                            delimiter=",", engine='python')
    dataframe = dataframe.dropna(axis=0, subset=[label])
    dataframe = dataframe.reset_index(drop=True)
    quantile_scaler = QuantileTransformer(random_state=0,
                                          output_distribution='uniform')
    Scaled_amount = quantile_scaler.fit_transform(
        dataframe['Amount'].values.reshape(-1, 1))
    Scaled_time = quantile_scaler.fit_transform(
        dataframe['Time'].values.reshape(-1, 1))
    dataframe.drop(['Time', 'Amount'], axis=1, inplace=True)
    dataframe.insert(0, 'Amount', Scaled_amount)
    dataframe.insert(1, 'Time', Scaled_time)
    Y_true = dataframe.loc[dataframe[label] == 1]
    Y_false = dataframe.loc[dataframe[label] == 0]
    Y_false = Y_false.sample(n=20000)
    # Bug fix: DataFrame.append was removed in pandas 2.x; concat is the
    # supported replacement with identical semantics here.
    subdata = pd.concat([Y_true, Y_false], ignore_index=True)
    dataframe = subdata.sample(frac=1)
    dataframe = dataframe.reset_index(drop=True)
    print('No Frauds', round(dataframe[label].value_counts()[0]), ' of the dataset')
    print('Frauds', round(dataframe[label].value_counts()[1]), ' of the dataset')
    # select_dtypes replaces the private _get_numeric_data API.
    numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # fillna(..., inplace=True) on a column selection can operate on a
        # copy and raise chained-assignment warnings; assign back instead.
        dataframe[col] = dataframe[col].fillna(dataframe[col].median())
    for col in numeric_cols:
        if col == label:
            continue
        est = KBinsDiscretizer(n_bins=13, encode='ordinal', strategy='uniform')
        dataframe[col] = est.fit_transform(dataframe[[col]])
    return dataframe
def train_neural_network(x):
    # Builds the RNN graph, trains it with Adam on softmax cross-entropy and
    # prints per-epoch loss plus final test accuracy (TF1 graph/session API).
    logits = recurrent_neural_network(x)
    prediction = tf.nn.softmax(logits)
    #prediction = recurrent_neural_network(x)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        #from sklearn.model_selection import train_test_split
        #epoch_x, test_x, epoch_y, test_y = train_test_split(x1, y1, test_size = 0.25)
        #print(epoch_x.shape)
        epoch_x1, epoch_y1, test_x1, test_y1 = readCSV(train_path)
        # NOTE(review): the quantile scaler is refitted on every array,
        # including the label arrays and the test split — confirm this is
        # intentional rather than transform() reuse of the train fit.
        sc = QuantileTransformer(output_distribution='uniform')
        epoch_x1 = sc.fit_transform(epoch_x1)
        epoch_y1 = sc.fit_transform(epoch_y1)
        test_x1 = sc.fit_transform(test_x1)
        test_y1 = sc.fit_transform(test_y1)
        # Split the training data into 55 equally-sized mini-batches.
        epoch_x1 = np.split(epoch_x1, 55)
        #print(epoch_x1)
        epoch_y1 = np.split(epoch_y1, 55)
        for epoch in range(hm_epochs):
            epoch_loss = 0
            #epoch_y=np.split(epoch_y,20)
            for i, j in zip(epoch_x1, epoch_y1):
                e_x = i
                e_y = j
                # Reshape each batch to (batch, time-steps, features) for the RNN.
                e_x = e_x.reshape((batch_size, n_chunks, chunk_size))
                #e_x = .reshape(e_x, shape=[batch_size,n_chunks,chunk_size])
                _, c = sess.run([optimizer, cost], feed_dict={x: e_x, y: e_y})
                epoch_loss += c
            print('Epoch', epoch, 'completed out of', hm_epochs, 'loss:', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print(
            'Accuracy:',
            accuracy.eval({
                x: test_x1.reshape((330, n_chunks, chunk_size)),
                y: test_y1
            }))
        '''pred = prediction.eval({x: test_x})
def test_transform_default_params():
    """Round-trip check: QuantileTransformerTF must reproduce sklearn's
    forward and inverse quantile transforms for default construction."""
    n_samples = 1000
    rng = np.random.RandomState(22922)
    columns = [
        rng.lognormal(10, 5, n_samples),
        rng.uniform(-10, 0, n_samples),
        rng.normal(10, 10, n_samples),
        rng.normal(-1, 1, n_samples),
    ]
    data = np.stack(columns, axis=1)
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=3434)
    data_transformed_sk = transformer.fit_transform(data)
    # sklearn itself must round-trip before comparing against the TF port.
    np.testing.assert_allclose(
        data, transformer.inverse_transform(data_transformed_sk))
    transformer_tf = QuantileTransformerTF(transformer)
    forward = transformer_tf.transform(data.astype(np.float64), False)
    backward = transformer_tf.transform(forward, True)
    with tf.Session() as session:
        forward_val, backward_val = session.run([forward, backward])
    np.testing.assert_allclose(data_transformed_sk, forward_val)
    np.testing.assert_allclose(data, backward_val)
def transform_spectrum(spectrum, baseline_max=None, baseline_min=None, qt=None):
    """Rescale spectra onto a normalized baseline/height representation.

    Each row's baseline (per-row minimum) is min-max normalised (bounds are
    computed from the data unless baseline_max/baseline_min are supplied) and
    the peak height is quantile-transformed (a new transformer is fitted when
    ``qt`` is None, otherwise the supplied one is applied).

    Returns the transformed spectra together with the baseline bounds and the
    quantile transformer so subsequent calls can reuse them.
    """
    if spectrum.ndim == 1:
        spectrum = spectrum.reshape(-1, len(spectrum))
    shape_norm, _, _ = normalise_spectrum(spectrum)
    floor = spectrum.min(axis=1)
    ceil = spectrum.max(axis=1)
    # Min-max normalise the baseline, fitting bounds if none were given.
    if (baseline_max is None) or (baseline_min is None):
        baseline_norm, baseline_max, baseline_min = normalise(
            floor, floor.max(), floor.min())
    else:
        baseline_norm, baseline_max, baseline_min = normalise(
            floor, baseline_max, baseline_min)
    baseline_norm *= 2
    span = ceil - floor
    # Quantile-transform the peak height; fit a fresh transformer on first use.
    if qt is None:
        qt = QuantileTransformer(n_quantiles=50, random_state=0)
        span_norm = qt.fit_transform(span.reshape(-1, 1))
    else:
        span_norm = qt.transform(span.reshape(-1, 1))
    span_norm = span_norm * 2 + 1
    t_spectrum = np.transpose(shape_norm.T * span_norm.flatten() + baseline_norm.T)
    return t_spectrum, baseline_max, baseline_min, qt
def get_songs(self, songfile=None, host='35.196.88.209', user='******',
              password='******', database='SPOTIFY'):
    """Load songs from MySQL (or a local CSV), quantile-transform the kept
    numeric features into ``self.raw_data`` and persist the fitted
    transformer to 'qt.pickle'.

    As a security measure, IP must be whitelisted in Google cloud prior to
    getting song data.

    Args:
        songfile: optional local CSV path; when given, the database is not
            contacted.
        host, user, password, database: MySQL connection settings.
    """
    if not songfile:
        # Bug fix: the connection previously ignored the host/user/password/
        # database parameters and re-hard-coded their values, making the
        # keyword arguments dead. Defaults are unchanged, so existing
        # callers see identical behavior.
        conn = pymysql.connect(host=host, user=user,
                               password=password, database=database)
        query = """ SELECT * FROM songs """
        print('fetching songs from database')
        self.songs = pd.read_sql(query, conn)
        conn.close()
    else:
        print('reading songs from local file')
        self.songs = pd.read_csv(songfile, skiprows=[1])
    self.N = self.songs.shape[0]
    self.songs_labeled_ = self.songs[['song_id']].copy()
    qt = QuantileTransformer(output_distribution='normal', random_state=self.rng)
    self.raw_data = qt.fit_transform(np.array(self.songs[self.keepers]))
    dump(qt, 'qt.pickle')
    print("Saved transformer to file: 'qt.pickle'")
class LinearClassification:
    """Binary classifier: quantile-transformed features feeding a balanced
    L2 logistic regression."""

    def __init__(self, random_seed=82):
        # Distinct seeds for the transformer and the model are derived from
        # the base seed so runs are reproducible but decorrelated.
        self.random_seed = random_seed
        self.transformer_params = {'random_state': self.random_seed + 1}
        self.transformer = QuantileTransformer(**self.transformer_params)
        self.model_params = {
            'penalty': 'l2',
            'C': 5.0,
            'class_weight': 'balanced',
            'random_state': self.random_seed + 2,
            'solver': 'saga',
            'max_iter': 250,
            'n_jobs': 4,
        }
        self.model = None

    def train(self, data, label):
        """Fit the quantile transformer and a LogisticRegression on
        mean-imputed data."""
        # Remember the train-time column means so predict() imputes identically.
        self.fillna_values = data.mean()
        features = self.transformer.fit_transform(data.fillna(self.fillna_values))
        self.model = LogisticRegression(**self.model_params)
        self.model.fit(features, label)

    def predict(self, data):
        """Return positive-class probabilities for ``data``."""
        features = self.transformer.transform(data.fillna(self.fillna_values))
        return self.model.predict_proba(features)[:, 1]
def get_full_dataset(path, dtype=np.float32):
    # Loads every CSV under `path`, concatenates them into one frame in the
    # canonical column order, splits off train/validation and quantile-
    # normalizes the DLL + feature columns (scaler fitted on train only).
    # Returns (data_train, data_val, scaler); the test split is discarded.
    df_list = []
    for file_name in os.listdir(path):
        if file_name.endswith(".csv"):
            df_list.append(load_file(os.path.join(path, file_name), dtype))
    data_full = pd.concat(df_list, ignore_index=True, join="inner",
                          copy=False)[ONE_AND_TRUE_COLUMNS_ORDER]
    # Since we'll be doing quite some parameter search,
    # we'll ignore the test for now
    data_train, data_val, _ = split(data_full)
    columns_to_rescale = dll_columns + feature_columns
    # subsample is set far above any realistic row count so that no
    # subsampling happens when fitting the quantiles.
    scaler = QuantileTransformer(output_distribution="normal",
                                 n_quantiles=int(1e5),
                                 subsample=int(1e10),
                                 copy=False)  # TODO(kazeev) does it copy?
    print("It will now print a warning, but still work. Most likely due to"
          " a copy of data_full made by train_test_split, but feel free to"
          " investigate.")
    data_train.loc[:, columns_to_rescale] = scaler.fit_transform(
        data_train.loc[:, columns_to_rescale].values).astype(dtype)
    # Validation uses transform() only — no leakage of validation statistics.
    data_val.loc[:, columns_to_rescale] = scaler.transform(
        data_val.loc[:, columns_to_rescale].values).astype(dtype)
    return data_train, data_val, scaler
def quantile_transform2(input_path, output_path):
    """Gaussianize non-normal, non-price features of every crypto CSV.

    For each file in ``input_path``, every column except Date/OHLC/Adj Close
    that fails scipy's normality test (p <= 0.05) is quantile-transformed to
    a normal distribution, searching the smallest n_quantiles (capped at 100)
    whose output passes the test. Results are written to ``output_path``.
    Paths are expected to end with a separator, as files are appended
    directly to them.
    """
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        # Dead-code fix: a QuantileTransformer(n_quantiles=50) was previously
        # constructed here but never used — the loop below always builds its own.
        for feature in df.columns.values:
            # TODO: consolidate with the while-loop search below.
            if feature not in [
                    'Date', 'Open', 'High', 'Close', 'Low', 'Adj Close'
            ]:
                stat, p = stats.normaltest(df[feature])
                if p <= 0.05:
                    print('transforming:' + feature)
                    p = -1
                    n_t = 1
                    # Search the smallest quantile count whose output passes
                    # the normality test; give up once n_t reaches 100.
                    while p <= 0.05:
                        qt = QuantileTransformer(n_quantiles=n_t,
                                                 random_state=0,
                                                 output_distribution="normal")
                        quanrtil = qt.fit_transform(
                            df[feature].values.reshape(-1, 1))
                        new_values = pd.Series(quanrtil.reshape(-1))
                        stat, p = stats.normaltest(new_values)
                        if p > 0.05:
                            df[feature] = pd.Series(new_values)
                            print('num_quantiles:' + str(n_t))
                        elif (n_t < 100):
                            n_t += 1
                        else:
                            break
        df.to_csv(output_path + crypto, sep=",", index=False)
def quantile_transform(input_path, output_path):
    """Quantile-transform every non-Date column of each crypto CSV toward a
    normal distribution.

    For each feature, the smallest ``n_quantiles`` whose transformed values
    pass scipy's normality test is searched. The search is capped at 100
    quantiles (matching ``quantile_transform2``); features that never pass
    are written out unchanged. Paths are expected to end with a separator.
    """
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        print(crypto)
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature != "Date":
                print('transforming:' + feature)
                p = -1
                n_t = 1
                while p <= 0.05:
                    qt = QuantileTransformer(n_quantiles=n_t,
                                             random_state=0,
                                             output_distribution="normal")
                    quanrtil = qt.fit_transform(
                        df[feature].values.reshape(-1, 1))
                    new_values = pd.Series(quanrtil.reshape(-1))
                    stat, p = stats.normaltest(new_values)
                    if p > 0.05:
                        df[feature] = pd.Series(new_values)
                        print('num_quantiles:' + str(n_t))
                    elif n_t < 100:
                        n_t += 1
                    else:
                        # Robustness fix: the original incremented n_t without
                        # bound and could spin forever on a feature that never
                        # passes the normality test.
                        break
        df.to_csv(output_path + crypto, sep=",", index=False)
def predict_on_nrc_liwc(self, raw_test_data):
    # This method generates predictions based on NRC/LIWC data: the two
    # feature tables are merged on userid, scaled, and fed to the
    # pre-trained self._nrc_liwc_model; returns a (userid, gender) frame.
    liwc_test_df = raw_test_data.get_liwc().copy()
    nrc_test_df = raw_test_data.get_nrc().copy()
    # Lower-case the column names so the userid merge keys line up.
    liwc_test_df.columns = [x.lower() for x in liwc_test_df.columns]
    nrc_test_df.columns = [x.lower() for x in nrc_test_df.columns]
    test_users = raw_test_data.get_profiles()['userid']
    liwc_data = pd.merge(test_users, liwc_test_df, on="userid")
    # We make of fusion of NRC and LIWC data
    nrc_liwc_data = pd.merge(liwc_data, nrc_test_df, on="userid")
    X_test = nrc_liwc_data.drop(['userid'], axis=1)
    # We scale the data using sklearn's QuantileTransformer
    # (100 is the positional n_quantiles argument).
    # NOTE(review): the scaler is fitted on the test data itself rather than
    # reusing a scaler fitted at training time — confirm this matches how
    # the model was trained.
    q_scaler = QuantileTransformer(100)
    X_scaled = q_scaler.fit_transform(X_test)
    prediction = self._nrc_liwc_model.predict_classes(X_scaled)
    print("NRC LIWC prediction: ", len(prediction))
    df = pd.DataFrame()
    df['userid'] = nrc_liwc_data['userid']
    df['gender'] = prediction
    return df
def my_quantile_transform(train_targets, non_train_targets):
    """Uniform-quantile-transform targets in place.

    The transformer is fitted on ``train_targets`` and applied to both
    frames; the (mutated) frames are returned.
    """
    qt = QuantileTransformer(output_distribution="uniform")
    cols = train_targets.columns
    train_targets[cols] = qt.fit_transform(train_targets.values)
    non_train_targets[cols] = qt.transform(non_train_targets.values)
    return train_targets, non_train_targets
def rankgauss(X_train, y, X_test, id_test):
    """RankGauss-normalize the features: fit a normal-output quantile
    transform on X_train and apply it to both splits in place. Targets and
    ids pass through untouched."""
    transformer = QuantileTransformer(
        n_quantiles=100, random_state=0, output_distribution='normal')
    X_train.iloc[:, :] = transformer.fit_transform(X_train)
    X_test.iloc[:, :] = transformer.transform(X_test)
    return X_train, y, X_test, id_test
def get_and_process_boston_dataset(random_state: int = 42,
                                   normalize_y: bool = True,
                                   normalize_X: bool = True):
    """Load the Boston housing data, split it with continuous stratification
    and optionally normalize.

    When ``normalize_y``, the target is quantile-gaussianized (fit on train
    only); when ``normalize_X``, features are standard-scaled (fit on train
    only). Targets are always returned as column vectors.
    """
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = continious_stratification(
        X, y, random_state=random_state)
    if normalize_y:
        target_scaler = QuantileTransformer(n_quantiles=300,
                                            output_distribution="normal",
                                            random_state=random_state)
        y_train = target_scaler.fit_transform(y_train[:, None])
        y_test = target_scaler.transform(y_test[:, None])
    else:
        y_train, y_test = y_train[:, None], y_test[:, None]
    if normalize_X:
        feature_scaler = StandardScaler()
        X_train = feature_scaler.fit_transform(X_train)
        X_test = feature_scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
def quantile_transformer(data):
    """Quantile-transform every column of ``data`` (default uniform output).

    Returns a new DataFrame with the same columns.

    Fix: the result now keeps the original row index; previously it was
    silently reset to a RangeIndex, which broke later index-based alignment
    and joins.
    """
    transformed = QuantileTransformer().fit_transform(data)
    return pd.DataFrame(transformed, columns=list(data.columns), index=data.index)
def fit(self, X: list[Config[T]], y: list[Performance]) -> None:
    """
    Uses the provided data to fit a model which is able to predict the
    target variables from the input.

    Args:
        X: The input configurations.
        y: The performance values associated with the input configurations.
    """
    y_numpy = self.performance_transformer.fit_transform(y)
    if self.output_normalization is not None:
        # Normalization (quantile or standard) is applied independently per
        # dataset, with datasets identified via a label encoding of names.
        if self.output_normalization == "quantile":
            transformer = QuantileTransformer()
        else:
            transformer = StandardScaler()
        encoder = LabelEncoder()
        dataset_names = [config.dataset.name() for config in X]
        dataset_indices = encoder.fit_transform(dataset_names)
        normalized = np.empty_like(y_numpy)
        for dataset_id in range(len(encoder.classes_)):
            selector = dataset_indices == dataset_id
            normalized[selector] = transformer.fit_transform(y_numpy[selector])
        y_numpy = normalized
    self._fit(X, y_numpy)
def test_transform():
    """QuantileTransformerTF restricted to columns [1, 2, 3] must match
    sklearn's forward and inverse transforms on those columns."""
    n_samples = 10000
    rng = np.random.RandomState(223532)
    repeated = rng.normal(0, 1, n_samples // 4)
    data = np.stack([
        rng.uniform(-10, 10, n_samples),
        rng.lognormal(10, 5, n_samples),
        np.concatenate([repeated] * 4),
        rng.normal(-1, 1, n_samples),
    ], axis=1)
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=34214)
    transformed_sk = transformer.fit_transform(data)
    # sklearn must round-trip before comparing against the TF port.
    np.testing.assert_allclose(
        data, transformer.inverse_transform(transformed_sk))
    # To test that QuantileTransformerTF picks up the right columns we ask
    # it only for columns [1, 2, 3] and feed it data[:, 1:].
    transformer_tf = QuantileTransformerTF(transformer, [1, 2, 3],
                                           dtype=np.float64)
    forward = transformer_tf.transform(data[:, 1:].astype(np.float64), False)
    backward = transformer_tf.inverse_transform(forward)
    with tf.Session() as session:
        forward_val, backward_val = session.run([forward, backward])
    np.testing.assert_allclose(transformed_sk[:, 1:], forward_val)
    np.testing.assert_allclose(data[:, 1:], backward_val)
def rankGauss(train, test, col):
    """Apply a RankGauss (normal-output quantile) transform to column(s)
    ``col``: fit on train, apply to test, mutate both frames in place and
    return them."""
    qt = QuantileTransformer(n_quantiles=100,
                             random_state=0,
                             output_distribution="normal")
    train[col] = qt.fit_transform(train[col].values)
    test[col] = qt.transform(test[col].values)
    return train, test
def normalize(trn, val, test):
    """
    Performs quantile normalization on the train, test and validation data.
    The QuantileTransformer is fitted on the train data only, then applied
    to the validation and test data.

    Args:
        trn: train data - pandas dataframe.
        val: validation data - pandas dataframe.
        test: test data - pandas dataframe.

    Returns:
        trn_norm: normalized train data - pandas dataframe.
        val_norm: normalized validation - pandas dataframe.
        tst_norm: normalized test data - pandas dataframe.
    """
    scaler = QuantileTransformer(n_quantiles=100,
                                 random_state=0,
                                 output_distribution="normal")

    def as_frame(values, like):
        # Re-wrap the numpy output, preserving index and column labels.
        return pd.DataFrame(values, index=like.index, columns=like.columns)

    trn_norm = as_frame(scaler.fit_transform(trn), trn)
    val_norm = as_frame(scaler.transform(val), val)
    tst_norm = as_frame(scaler.transform(test), test)
    return trn_norm, val_norm, tst_norm
class LinearRegression:
    """Lasso/Ridge regressor over quantile-transformed, mean-imputed inputs
    with a time-boxed alpha search.

    Despite the class name, the underlying estimator is Lasso, optionally
    replaced by Ridge when it cross-validates better within the time budget.
    """

    def __init__(self, random_seed=82):
        self.random_seed = random_seed
        self.transformer_params = {'random_state': self.random_seed + 1}
        self.transformer = QuantileTransformer(**self.transformer_params)
        self.model_params = {'max_iter': 1000, 'random_state': self.random_seed + 2}
        self.model = None

    def train(self, data, label, ds=None, train_tl=200):
        """Fit a baseline Lasso, then (time permitting) grid-search alpha for
        Lasso and Ridge and refit with the better configuration.

        Args:
            data: feature DataFrame; NaNs are imputed with column means
                (in place).
            label: regression target.
            ds: optional datetime series; when given, time-series CV splits
                are used instead of KFold.
            train_tl: overall training time limit in seconds.
        """
        start_time = time.time()
        self.fillna_values = data.mean()
        data.fillna(self.fillna_values, inplace=True)
        # Baseline model: kept if the search below fails or runs out of time.
        self.model = Lasso(**self.model_params, alpha=0.1)
        self.model.fit(self.transformer.fit_transform(data), label)
        model_train_time = time.time() - start_time
        try:
            # search
            if ds is not None:
                data['ds'] = ds
                cv = TimeSeriesCV(n_splits=min(6, data.shape[0] // 30))
                folds = list(cv.split(data))
                data.drop('ds', axis=1, inplace=True)
            else:
                cv = KFold(n_splits=3, shuffle=True,
                           random_state=self.random_seed + 3)
                folds = list(cv.split(data))
            # Size the alpha grid so the search fits inside the time budget.
            n_alphas = int(min(35, (train_tl - 2 * model_train_time)
                               / (model_train_time * len(folds))))
            lasso_alpha, lasso_rmse = self._search_params(
                data, label, model=Lasso,
                search_space=np.logspace(-2, 0, n_alphas), folds=folds)
            Model, best_alpha = Lasso, lasso_alpha
            # Remaining budget determines whether Ridge is also tried.
            n_alphas = int(min(10, (train_tl - (time.time() - start_time)
                                    - model_train_time)
                               / (model_train_time * 1.5 * len(folds))))
            if n_alphas > 2:
                ridge_alpha, ridge_rmse = self._search_params(
                    data, label, model=Ridge,
                    search_space=np.logspace(-2, 2, n_alphas), folds=folds)
                if lasso_rmse * 0.99 < ridge_rmse:
                    best_alpha = ridge_alpha
                    Model = Ridge
            self.model_params.update({'alpha': best_alpha,
                                      'random_state': self.random_seed + 4})
            self.model = Model(**self.model_params)
            self.model.fit(self.transformer.transform(data), label)
        except Exception:
            # Robustness fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit. The search stays best-effort —
            # on any failure the baseline Lasso fitted above is kept.
            pass

    def predict(self, data):
        """Return predictions on NaN-imputed, quantile-transformed data."""
        data = self.transformer.transform(data.fillna(self.fillna_values))
        preds = self.model.predict(data)
        return preds

    def _search_params(self, data, label, model, search_space, folds=3,
                       scorer=None):
        # Grid-search m__alpha for `model` inside a transformer+model pipeline.
        scorer = scorer or make_scorer(_rmse, greater_is_better=False)
        pipeline = Pipeline([
            ('t', QuantileTransformer(**self.transformer_params)),
            ('m', model(**self.model_params))
        ])
        gs = GridSearchCV(pipeline, {'m__alpha': search_space},
                          scoring=scorer, cv=folds)
        gs.fit(data, label)
        return gs.best_params_['m__alpha'], gs.best_score_
def normal_transform(df, scale_target):
    """Quantile-gaussianize all columns of ``df``.

    When ``scale_target`` is falsy, the 'MedHouseVal' column is restored to
    its original (untransformed) values afterwards. Returns a new DataFrame
    with the same columns.
    """
    qt = QuantileTransformer(output_distribution="normal")
    preserved = None if scale_target else df['MedHouseVal']
    df = pd.DataFrame(qt.fit_transform(df), columns=df.columns)
    if preserved is not None:
        df['MedHouseVal'] = preserved
    return df
def bad_quantile_transform(train_targets, non_train_targets):
    """Normal-quantile-transform targets in place (fit on train, apply to
    both) and return a third element of the wrong type — presumably a
    fixture for exercising invalid-inversion handling."""
    qt = QuantileTransformer(output_distribution="normal", n_quantiles=100)
    cols = train_targets.columns
    train_targets[cols] = qt.fit_transform(train_targets.values)
    non_train_targets[cols] = qt.transform(non_train_targets.values)
    return (train_targets, non_train_targets,
            "i am the wrong type for an inversion result")
def _scale(self,stsc,lab,dev=True): ctrans = ColumnTransformer( [('scale_all', StandardScaler(), stsc), ('cats', OneHotEncoder(categories='auto'), lab)]) # xtsc = StandardScaler() xtsc = QuantileTransformer(output_distribution='normal', random_state=self.rand) # ytsc = StandardScaler() ytsc = QuantileTransformer(output_distribution='normal', random_state=self.rand) mmx = MinMaxScaler(feature_range=(-1,1)) mmy = MinMaxScaler(feature_range=(-1,1)) #wtsc = StandardScaler(with_mean=False) self.X_train_ft = ctrans.fit_transform(self.X_train_ft) self.X_test_ft = ctrans.transform(self.X_test_ft) self.X_train_ts = xtsc.fit_transform(self.X_train_ts) self.X_test_ts = xtsc.transform(self.X_test_ts) self.X_train_ts = mmx.fit_transform(self.X_train_ts) self.X_test_ts = mmx.transform(self.X_test_ts) if self.ts: self.x_train = self.X_train_ts self.x_test = self.X_test_ts else: self.x_train = np.concatenate([self.X_train_ft, self.X_train_ts], axis=1) self.x_test = np.concatenate([self.X_test_ft, self.X_test_ts], axis=1) # self.train_wt = wtsc.fit_transform(self.train_wt) # self.test_wt = wtsc.transform(self.test_wt) self.y_train_sc = ytsc.fit_transform(self.y_train) self.y_test_sc = ytsc.transform(self.y_test) self.y_train_sc = mmy.fit_transform(self.y_train_sc) self.y_test_sc = mmy.transform(self.y_test_sc) if self.wt: self.y_train = np.concatenate([self.y_train,self.train_wt],axis=1) self.y_test = np.concatenate([self.y_test,self.test_wt],axis=1) self.xtrans_sc = xtsc self.xtrans_mm = mmx self.ytrans_mm = mmy self.ytrans_sc = ytsc self.ftrans = ctrans
def folder_readpile(path, npset, samples, list_index):
    '''Importer for files stored in a folder in which we have more than one
    strand. It quantile normalizes the data to make it comparable between
    samples.
    -path: Location of the files.
    -npset: np.array to add the files in the path.
    -samples: list of samples selected. If not it will take everything in
     the file.
    -list_index: stores the samples id to recognize each row of the npset.'''
    for file in os.listdir(path):
        sample_id = file.replace('_read.pile', "")
        # Skip files not in the requested sample list (when one is given).
        if samples is not None and sample_id not in samples:
            continue
        tmp = pd.read_csv(path + file, sep='\t')
        if 'Unnamed: 3' in tmp.columns:
            tmp = tmp.drop(['Unnamed: 3'], axis=1)
        if 'pos' in tmp.columns:
            tmp = tmp.drop(['pos'], axis=1)
        # Quantile-normalize every column to a uniform distribution so the
        # samples are comparable with one another.
        qqnorm = QuantileTransformer(n_quantiles=1000,
                                     output_distribution='uniform',
                                     random_state=0)
        tmp.loc[:, :] = qqnorm.fit_transform(tmp)
        # Bug fix: DataFrame.as_matrix() was removed in pandas 1.0 —
        # to_numpy() is the supported equivalent.
        tmp_np = np.reshape(tmp.to_numpy(), (1, tmp.shape[0], tmp.shape[1]))
        # Bug fix: the original tested `sample_id in samples` before checking
        # `samples is None`, raising TypeError on the second file whenever
        # samples was None.
        if npset is None:
            npset = tmp_np
        else:
            npset = np.append(npset, tmp_np, axis=0)
        list_index.append(sample_id)
    return npset, list_index
def quantile_transform_no_invert(train_targets, non_train_targets):
    """Normal-quantile-transform targets in place — fit on the train frame,
    apply to both — and return the two mutated frames (no inversion info)."""
    qt = QuantileTransformer(output_distribution="normal", n_quantiles=100)
    cols = train_targets.columns
    train_targets[cols] = qt.fit_transform(train_targets.values)
    non_train_targets[cols] = qt.transform(non_train_targets.values)
    return train_targets, non_train_targets
def map_2_uniform(X):
    '''Quantile-transform each row of N*M data independently, returning a
    float32 array of the per-row transformed values in [0, 1].'''
    # NOTE(review): each row is reshaped to (1, -1), i.e. one sample with M
    # features, so every per-feature quantile fit sees a single value and
    # the output is degenerate (constant). If the intent was to rank-
    # transform the M values within a row, reshape(-1, 1) would be needed —
    # confirm before changing, since the output shape would differ.
    quantile_transformer = QuantileTransformer(random_state=1993)
    data = [
        quantile_transformer.fit_transform((X[i].reshape(1, -1)))
        for i in range(0, X.shape[0])
    ]
    return np.array(data).astype('float32')