def test_basic(self, output_distribution):
    """Check that the ``dpp`` QuantileTransformer tracks the ``spp`` one.

    Both estimators are fitted on the same chunked uniform sample. Their
    fitted state is compared with a loose tolerance, then the quantiles are
    copied across so the transforms can be compared tightly, and finally the
    transform/inverse-transform round trip is checked.
    """
    # presumably dpp is dask_ml.preprocessing and spp is sklearn.preprocessing
    # -- confirm against the module imports.
    sample = da.random.RandomState(0).uniform(size=(1000, 3), chunks=50)
    dask_qt = dpp.QuantileTransformer(output_distribution=output_distribution)
    sklearn_qt = spp.QuantileTransformer(
        output_distribution=output_distribution)

    dask_qt.fit(sample)
    sklearn_qt.fit(sample)
    assert_estimator_equal(dask_qt, sklearn_qt, atol=0.02)

    # Share the reference quantiles so that both transforms are driven by
    # identical fitted state from here on.
    dask_qt.quantiles_ = sklearn_qt.quantiles_
    assert_eq_ar(
        dask_qt.transform(sample), sklearn_qt.transform(sample), atol=1e-7)

    round_trip = dask_qt.inverse_transform(dask_qt.transform(sample))
    assert_eq_ar(sample, round_trip)
def scaler(self, scaler_dir='/home/hchong/Documents/kaggle/plasticc/scaler'):
    """Quantile-transform every column of ``self.scaler_df`` to a normal shape.

    For each column a dedicated ``QuantileTransformer`` is fitted, persisted
    with ``joblib`` as ``<scaler_dir>/<column>.save`` and kept in memory.
    For every ``(low, high)`` pair returned by ``self.quantile_check`` the
    correlation between the original and transformed values that fall inside
    that band is recorded.

    Parameters
    ----------
    scaler_dir : str, optional
        Directory the fitted transformers are dumped into. Defaults to the
        location that used to be hard-coded.

    Sets
    ----
    self.scaled_df : DataFrame with the transformed columns (same names).
    self.scaler_dict : dict mapping column name -> fitted transformer.
    self.scaled_corr_df : DataFrame with columns
        ``['columns', 'threshold', 'correlation']``.
    """
    import os  # local import: the file-level import block is not visible here

    df = self.scaler_df
    # Thresholds handed to quantile_check; the returned pairs are used below
    # as inclusive value bounds on the original data.
    thres_list = [0.05, 0.1]

    scaled_columns = []
    corr_rows = []
    scaler_dict = {}
    for column in df:
        pd_series = df[column]
        gaussian_scaler = preprocessing.QuantileTransformer(
            output_distribution='normal')
        x = pd_series.values.reshape(-1, 1)  # sklearn expects a 2-D array
        x_scaled = pd.Series(gaussian_scaler.fit_transform(x)[:, 0])

        x_thres = self.quantile_check(x, thres_list)
        paired = pd.concat(
            [x_scaled, pd_series.reset_index(drop=True)],
            axis=1,
            ignore_index=True)
        paired.columns = ['transform', 'ori']
        for bounds in x_thres:
            in_band = paired.loc[paired['ori'].between(bounds[0], bounds[1])]
            corr_rows.append([column, bounds[0], in_band.corr().iloc[0, 1]])

        scaled_columns.append(x_scaled)
        scaler_filename = os.path.join(
            scaler_dir, '{column}.save'.format(column=column))
        joblib.dump(gaussian_scaler, scaler_filename)
        scaler_dict[column] = gaussian_scaler

    # Assemble the frames once: DataFrame.append no longer exists in
    # pandas >= 2.0 and growing a frame inside the loop is quadratic.
    if scaled_columns:
        scaled_df = pd.concat(scaled_columns, axis=1, ignore_index=True)
    else:
        scaled_df = pd.DataFrame()
    scaled_df.columns = list(df.columns)
    corr_df = pd.DataFrame(
        corr_rows, columns=['columns', 'threshold', 'correlation'])

    self.scaled_df = scaled_df
    self.scaler_dict = scaler_dict
    self.scaled_corr_df = corr_df
def normalize_data(X, feature_dict, using_features, dont_show=True):
    """Normalize ``X`` while limiting the influence of -9999 placeholders.

    Inputs:
    - X: 2-D numpy array (patients x features), e.g. 132 x 16. Values below
      -9998 are treated as "missing".
    - feature_dict: mapping used only for the diagnostic printout; it is
      indexed with the entries of ``using_features``.
    - using_features: sequence giving, per column of X, the key into
      ``feature_dict``.
    - dont_show: when False, print the affected features and the indices of
      the patients without any missing value.

    Outputs:
    - An array of the same shape as X.

    Function:
    - First, every column is mapped to a uniform distribution with
      ``preprocessing.QuantileTransformer``.
    - Second, every column that contains at least one missing value is
      L2-normalized again using only the patients that have no missing
      value at all, and those rows are overwritten in the result. Rows of
      patients with missing data keep the quantile-transformed values, so
      the shape of X is preserved.
    """
    features = using_features
    X = np.asarray(X)

    missing_mask = X < -9998
    # Every column holding a placeholder is recorded; the earlier row scan
    # stopped at the first hit in each row and could skip columns.
    miss_data_features = np.flatnonzero(missing_mask.any(axis=0))
    full_data_patients = np.flatnonzero(~missing_mask.any(axis=1))

    if not dont_show:
        print("\nMissing data features:")
        for j in miss_data_features:
            print(feature_dict[features[j]])
        print("\nFull data patients:")
        print(full_data_patients)

    scaler = preprocessing.QuantileTransformer(output_distribution='uniform')
    X_normalized = scaler.fit_transform(X)

    if miss_data_features.size and full_data_patients.size:
        # np.ix_ selects the rows x columns sub-matrix. Indexing with two
        # plain index lists pairs them element-wise and fails as soon as
        # more than one feature is affected.
        block = np.ix_(full_data_patients, miss_data_features)
        X_normalized[block] = preprocessing.normalize(
            X[block], norm='l2', axis=0)
    return X_normalized
def impute_method_setup(
        self,
        random_state=DEFAULT_IMPUTER_RANDOM_STATE,
        add_indicator=DEFAULT_IMPUTER_ADD_INDICATOR,
        initial_strategy=DEFAULT_IMPUTER_INITIAL_STRATEGY,
        max_iter=DEFAULT_IMPUTER_MAX_ITER,
        estimator=DEFAULT_IMPUTER_ESTIMATOR,
        output_distribution=DEFAULT_TRANSFORMER_OUTPUT_DISTRIBUTION,
        transformer_method=DEFAULT_TRANSFORMER_METHOD,
        transformer_standardize=DEFAULT_TRANSFORMER_STANDARDIZE):
    """
    Create the imputer and the two transformers used when imputing data.

    Three objects are built and stored on the instance; nothing is fitted
    here. Every argument is forwarded unchanged to the constructor named in
    brackets, so refer to the sklearn documentation for their meaning:

    IterativeImputer:
    https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
    QuantileTransformer:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    PowerTransformer:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html

    Args:
        random_state: (IterativeImputer & QuantileTransformer) seed for the
            pseudo random number generator
        add_indicator: (IterativeImputer) forwarded as ``add_indicator``
        initial_strategy: (IterativeImputer) forwarded as
            ``initial_strategy``
        max_iter: (IterativeImputer) forwarded as ``max_iter``
        estimator: (IterativeImputer) forwarded as ``estimator``
        output_distribution: (QuantileTransformer) forwarded as
            ``output_distribution``
        transformer_method: (PowerTransformer) forwarded as ``method``
        transformer_standardize: (PowerTransformer) forwarded as
            ``standardize``

    Returns:
        None. Sets ``self.imputer``, ``self.transformer_quantile`` and
        ``self.transformer_power``.
    """
    # Imputer options; verbosity follows the instance-level setting.
    imputer_options = dict(
        random_state=random_state,
        add_indicator=add_indicator,
        initial_strategy=initial_strategy,
        max_iter=max_iter,
        verbose=self.verbose,
        estimator=estimator,
    )
    self.imputer = IterativeImputer(**imputer_options)

    # Quantile transform options.
    quantile_options = dict(
        output_distribution=output_distribution,
        random_state=random_state,
    )
    self.transformer_quantile = preprocessing.QuantileTransformer(
        **quantile_options)

    # Power transform options.
    power_options = dict(
        method=transformer_method,
        standardize=transformer_standardize,
    )
    self.transformer_power = preprocessing.PowerTransformer(**power_options)
def feature_scale(feature_scaler, X_train, y_train):
    """Return a new, unfitted transformer selected by name.

    More information about these scalers can be found at
    https://scikit-learn.org/stable/modules/preprocessing.html

    Parameters
    ----------
    feature_scaler : str
        One of 'binarizer', 'one_hot_encoder', 'maxabs', 'minmax',
        'normalize', 'poly', 'power_transformer',
        'quantile_transformer_normal', 'robust', 'standard_scaler'.
    X_train, y_train
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The constructed transformer. It is NOT fitted.

    Raises
    ------
    ValueError
        If ``feature_scaler`` is not one of the supported names (previously
        this surfaced as an ``UnboundLocalError`` on ``model``).
    """
    # Factories are lambdas so that only the requested transformer is built.
    factories = {
        'binarizer': lambda: preprocessing.Binarizer(),
        # original author's note: the one-hot encoder is meant for y values
        'one_hot_encoder':
            lambda: preprocessing.OneHotEncoder(handle_unknown='ignore'),
        'maxabs': lambda: preprocessing.MaxAbsScaler(),
        'minmax': lambda: preprocessing.MinMaxScaler(),
        # L2 normalization
        'normalize': lambda: preprocessing.Normalizer(),
        'poly': lambda: PolynomialFeatures(2),
        'power_transformer':
            lambda: preprocessing.PowerTransformer(method='yeo-johnson'),
        'quantile_transformer_normal':
            lambda: preprocessing.QuantileTransformer(
                output_distribution='normal'),
        'robust': lambda: preprocessing.RobustScaler(quantile_range=(25, 75)),
        'standard_scaler': lambda: preprocessing.StandardScaler(),
    }
    if feature_scaler not in factories:
        raise ValueError(
            'Unknown feature_scaler %r; expected one of: %s'
            % (feature_scaler, ', '.join(sorted(factories))))
    model = factories[feature_scaler]()
    return model
def train_validate(df, normalization=None):
    """Sample, split, normalize and save a training/validation set.

    Steps performed:
    1. Draw a 70% sample of ``df`` with replacement.
    2. Use ``scalar_coupling_constant`` as the target and every column except
       ``id``, ``molecule_name`` and the target as features.
    3. Split 80/20 into training and validation parts.
    4. Normalize the features with the chosen method. The transformer is
       fitted on the training part only and applied to both parts.
    5. Save the four arrays as ``.npy`` files in ``DATA_DIR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above.
    normalization : str, optional
        One of "standardScalar", "minMax", "quartile", "normal with l1";
        any other value selects L2 row normalization. When omitted the user
        is prompted on stdin, as before.

    Returns
    -------
    None
    """
    print("[INFO] preparing X_train / y_train...")
    df = df.sample(frac=0.7, replace=True)
    y = pd.DataFrame(data=df, columns=["scalar_coupling_constant"])

    # Split the features and the target into training and validation sets.
    X_train, X_val, y_train, y_val = train_test_split(
        df.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1),
        y,
        test_size=0.20)

    if normalization is None:
        normalization = input("Which type of normalization do you want? (standardScalar, minMax, quartile, normal with l1, normal with l2, )... ")
    print("[INFO] Preparing normalization...")

    if normalization in ("standardScalar", "minMax", "quartile"):
        if normalization == "standardScalar":
            scaler = preprocessing.StandardScaler(
                copy=True, with_mean=True, with_std=True)
        elif normalization == "minMax":
            scaler = preprocessing.MinMaxScaler()
        else:
            scaler = preprocessing.QuantileTransformer(random_state=0)
        # Fit on the training part only, then apply the same mapping to the
        # validation part so both live in the same feature space. The
        # "standardScalar" branch used to fit without ever transforming.
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
    else:
        norm = 'l1' if normalization == "normal with l1" else 'l2'
        # Row-wise normalization is stateless, so it is applied to each part.
        X_train = preprocessing.normalize(X_train, norm=norm)
        X_val = preprocessing.normalize(X_val, norm=norm)

    print("Datasets: Prepared")
    print("Training sets have shape {} and {}.".format(X_train.shape, y_train.shape))
    print("Validation sets have shape {} and {}.".format(X_val.shape, y_val.shape))

    print("[INFO] saving data...")
    np.save(os.path.join(DATA_DIR, 'X_train.npy'), X_train)
    np.save(os.path.join(DATA_DIR, 'X_val.npy'), X_val)
    np.save(os.path.join(DATA_DIR, 'y_train.npy'), y_train)
    np.save(os.path.join(DATA_DIR, 'y_val.npy'), y_val)
    print("[INFO] data saved as numpy arrays...")
    print("[INFO] completed...")
def soft_voting_1(df_res, y):
    """Cross-validate ten classifiers and their soft-voting ensemble.

    The features are quantile-transformed once (fitted on all of
    ``df_res``), then every classifier and the equally weighted soft-voting
    ensemble is scored with 5-fold cross-validated ROC AUC. One line per
    model is printed.

    Returns the (unfitted) ``VotingClassifier``.
    """
    print('\n')
    print('SOFT VOTING')

    df_res = preprocessing.QuantileTransformer().fit_transform(df_res)

    # (ensemble key, printed label, estimator) for every ensemble member
    members = [
        ('ada', 'AdaBoostClassifier', ensemble.AdaBoostClassifier()),
        ('mlpc', 'MLPClassifier', MLPClassifier()),
        ('gbs', 'GradientBoosting', ensemble.GradientBoostingClassifier()),
        ('sgdc', 'SGDClassifier', SGDClassifier(loss='log', max_iter=1000)),
        ('lgr', 'LogisticRegression', LogisticRegression()),
        ('lrcv', 'LogisticRegressionCV', LogisticRegressionCV()),
        ('qda', 'QuadraticDiscriminantAnalysis',
         QuadraticDiscriminantAnalysis()),
        ('gnb', 'GaussianNB', GaussianNB()),
        ('knn', 'KNeighborsClassifier', KNeighborsClassifier(3)),
        ('cvc', 'SVC', SVC(probability=True)),
    ]

    eclf = VotingClassifier(
        estimators=[(key, clf) for key, _, clf in members],
        voting='soft',
        weights=[1] * len(members))

    candidates = [(label, clf) for _, label, clf in members]
    candidates.append(('Ensemble', eclf))
    for label, clf in candidates:
        scores = cross_val_score(clf, df_res, y, cv=5, scoring='roc_auc')
        print("ROC_AUC scoring: %0.5f (+/- %0.5f) [%s]" %
              (scores.mean(), scores.std(), label))
    return eclf
def norm_data(self, data, algorithm='norm'):
    """Normalize ``data`` with the selected algorithm.

    Parameters
    ----------
    data : iterable
        Values to normalize. It is materialized with ``np.array(list(data))``.
        The 'u-*' algorithms accept any dimensionality; all others require
        2-D input.
    algorithm : str
        'norm'      -- ``preprocessing.normalize`` with its defaults
        'max-min'   -- ``preprocessing.MinMaxScaler``
        'qt'        -- ``preprocessing.QuantileTransformer``
        'max'       -- ``preprocessing.MaxAbsScaler``
        'stand'     -- ``preprocessing.StandardScaler``
        'u-max'     -- custom: divide by the largest absolute value, giving
                       values in [-1, 1]
        'u-max-min' -- custom: global min-max scaling to [0, 1]
        'u-stand'   -- custom: subtract the global mean and divide by the
                       global standard deviation
        'decimal'   -- custom: divide by the smallest power of ten that is
                       >= the largest absolute value

    Returns
    -------
    numpy.ndarray with the normalized values.

    Raises
    ------
    ValueError
        For an unknown algorithm, or for input that is not at least 2-D
        when the algorithm requires it.
    """
    dt = np.array(list(data))

    # Custom, purely element-wise algorithms.
    if algorithm == 'u-max':
        # Largest absolute value; abs(max(dt)) is wrong when the largest
        # magnitude is negative.
        return dt / np.max(np.abs(dt))
    if algorithm == 'u-max-min':
        low = np.min(dt)
        return (dt - low) / (np.max(dt) - low)
    if algorithm == 'u-stand':
        return (dt - np.mean(dt)) / np.std(dt)

    # Names are resolved lazily so that only the requested class is touched.
    sklearn_scalers = {
        'max-min': 'MinMaxScaler',
        'qt': 'QuantileTransformer',
        'max': 'MaxAbsScaler',
        'stand': 'StandardScaler',
    }
    if algorithm not in sklearn_scalers and algorithm not in ('norm', 'decimal'):
        raise ValueError('unknown normalization algorithm: %r' % (algorithm,))
    if dt.ndim <= 1:
        raise ValueError('query 2d data')

    if algorithm == 'decimal':
        # This branch used to fall through to an unbound scaler.
        largest = np.max(np.abs(dt))
        if largest == 0:
            return dt / 1.0  # nothing to scale; avoids log10(0)
        return dt / (10.0 ** np.ceil(np.log10(largest)))
    if algorithm == 'norm':
        # normalize is a plain function, not an estimator with fit_transform.
        return preprocessing.normalize(dt)

    scaler = getattr(preprocessing, sklearn_scalers[algorithm])()
    return scaler.fit_transform(dt)
def scaleFeatures(data, opt='standard', **kwargs):
    """Scale all columns of ``data`` and return them as a new DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scale; its column names are reused for the result.
    opt : str
        'standard' (StandardScaler), 'robust' (RobustScaler), 'minmax'
        (MinMaxScaler), 'norm' (Normalizer), 'gaussian' (PowerTransformer
        with method='yeo-johnson') or 'quantile' (QuantileTransformer with
        output_distribution='normal').
    **kwargs
        Forwarded to the selected scaler's constructor. For 'gaussian' and
        'quantile' they are merged over the defaults listed above.

    Returns
    -------
    pandas.DataFrame with the scaled values, the columns of ``data`` and a
    fresh default index.

    Raises
    ------
    ValueError
        If ``opt`` is not a supported option.
    """
    # option -> (class name in sklearn.preprocessing, default kwargs)
    scaler_specs = {
        'standard': ('StandardScaler', {}),
        'robust': ('RobustScaler', {}),
        'minmax': ('MinMaxScaler', {}),
        'norm': ('Normalizer', {}),
        # NOTE(review): the original author marked 'gaussian' as
        # "doesn't work! no idea why"; the cause is not visible here.
        'gaussian': ('PowerTransformer', {'method': 'yeo-johnson'}),
        'quantile': ('QuantileTransformer', {'output_distribution': 'normal'}),
    }
    if opt not in scaler_specs:
        raise ValueError(
            'Unknown scaling option %r; expected one of: %s'
            % (opt, ', '.join(sorted(scaler_specs))))

    from sklearn import preprocessing

    class_name, defaults = scaler_specs[opt]
    params = dict(defaults)
    params.update(kwargs)
    scl = getattr(preprocessing, class_name)(**params)

    out = pd.DataFrame(scl.fit_transform(data), columns=data.columns)
    print("Features scaled using", opt, "scaling method!")
    return out
def tuned_gradboost(loadWeights):
    """Grid-search a QuantileTransformer + GradientBoosting pipeline.

    When ``loadWeights`` is truthy and a previously saved search exists at
    ``weights/<function name>.pkl`` it is loaded and reported. Otherwise a
    2-fold ``GridSearchCV`` is fitted on ``dataSet.X_train`` /
    ``dataSet.y_train``, reported and saved to that path.

    Parameters
    ----------
    loadWeights : bool
        Reuse a saved search result if one is available.

    Returns
    -------
    int
        Always 0.
    """
    pipe = Pipeline([
        ('std', preprocessing.QuantileTransformer()),
        ('gbc', GradientBoostingClassifier()),
    ])
    param_grid = [{
        'gbc__n_estimators': [100, 200, 250],
        'gbc__learning_rate': [0.1, 0.05, 0.01],
        'gbc__min_samples_leaf': [1, 10],  # 100 and 200 is bad
        'gbc__min_samples_split': [10, 100, 400],
        # NOTE(review): "auto" is, as far as I know, rejected by recent
        # scikit-learn releases for this estimator -- confirm against the
        # installed version.
        'gbc__max_features': ["auto", 10, 7, 1],
    }]

    # The file is named after this function; resolved once.
    weights_path = 'weights/' + sys._getframe().f_code.co_name + '.pkl'

    if loadWeights and os.path.exists(weights_path):
        # NOTE(review): joblib.load unpickles the file; only load files from
        # a trusted source.
        model_tuner = joblib.load(weights_path)
        report_summary(model_tuner)
        return 0

    # Create the output directory up front so that a missing folder cannot
    # throw away the result of the (expensive) search at dump time.
    weights_dir = os.path.dirname(weights_path)
    if not os.path.isdir(weights_dir):
        os.makedirs(weights_dir)

    model_tuner = GridSearchCV(
        pipe,
        param_grid,
        cv=2,
        n_jobs=-1,
        verbose=2,
        return_train_score=True)
    model_tuner.fit(dataSet.X_train, dataSet.y_train)
    report_summary(model_tuner)
    joblib.dump(model_tuner, weights_path, compress=1)
    return 0
def get_gaussian_data(self, x_train, x_test):
    """
    Map the raw data to a normal distribution with scikit-learn's
    QuantileTransformer.

    The transformer is fitted on the training data only and then applied to
    both sets, so the test data is mapped with the quantiles learned from
    training.

    Args:
        x_train (np.array): training data to transform
        x_test (np.array): test data to transform

    Returns:
        x_train_gauss (np.array): transformed training data
        x_test_gauss (np.array): transformed test data
    """
    quantile_transformer2 = preprocessing.QuantileTransformer(
        output_distribution='normal', random_state=17)
    x_train_gauss = quantile_transformer2.fit_transform(x_train)
    # transform, not fit_transform: re-fitting on the test set would map the
    # two sets with different quantiles.
    x_test_gauss = quantile_transformer2.transform(x_test)
    return x_train_gauss, x_test_gauss
def get_quantile_data(self, x_train, x_test):
    """
    Transform the raw data with scikit-learn's QuantileTransformer using its
    default output distribution.

    The transformer is fitted on the training data only and then applied to
    both sets, so the test data is mapped with the quantiles learned from
    training.

    Args:
        x_train (np.array): training data to transform
        x_test (np.array): test data to transform

    Returns:
        x_train_trans (np.array): transformed training data
        x_test_trans (np.array): transformed test data
    """
    quantile_transformer = preprocessing.QuantileTransformer(random_state=17)
    x_train_trans = quantile_transformer.fit_transform(x_train)
    # transform, not fit_transform: re-fitting on the test set would map the
    # two sets with different quantiles.
    x_test_trans = quantile_transformer.transform(x_test)
    return x_train_trans, x_test_trans
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    """Build a dimensionality-reduction + feature-selection pipeline.

    The number of feature clusters is the Johnson-Lindenstrauss minimum
    dimension for the sample count and ``eps``, capped at the number of
    features. A quantile-transform -> feature-agglomeration -> robust-scale
    pipeline is fitted on ``train_x``, an extra-trees classifier is fitted
    on its output, and a ``SelectFromModel`` built from that classifier
    (``prefit=True``) is chained after the reduction pipeline.

    Returns the combined pipeline.
    """
    n_samples, n_features, _ = get_counts_tt(train_x, train_y)

    # Number of components: JL bound, but never more than the feature count.
    jl_dim = random_projection.johnson_lindenstrauss_min_dim(
        n_samples=n_samples, eps=eps)
    n_clusters = min(jl_dim, n_features)

    # Scale, agglomerate down to n_clusters, then scale again.
    dr_pipe = pipeline.Pipeline([
        ('scaler', preprocessing.QuantileTransformer()),
        ('feat_agg', cluster.FeatureAgglomeration(n_clusters=n_clusters)),
        ('scaler2', preprocessing.RobustScaler()),
    ])
    dr_pipe.fit(train_x)
    reduced_x = dr_pipe.transform(train_x)

    # The classifier is trained on the reduced features.
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    xtc.fit(reduced_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))

    # Feature selection driven by the already fitted classifier.
    feat_sel = feature_selection.SelectFromModel(
        xtc, prefit=True, threshold=threshold)

    return pipeline.Pipeline([('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])
def scaler(self, method='yeo-johnson'):
    '''
    Fit a scaler on ``self.dataframe`` and replace it with the scaled data.

    Parameters
    ----------
    method : string, optional
        Scaling transformation to use. One of 'standard', 'minmax',
        'maxabs', 'robust', 'quantile' (normal output distribution), 'l1',
        'l2', 'max' (row normalization with that norm), 'box-cox' or
        'yeo-johnson'. The default is 'yeo-johnson'.

    Returns
    -------
    dataframe :
        The transformed data, exactly as returned by the scaler's
        ``transform`` (also stored in ``self.dataframe``).
    scaler : object
        The fitted scaler.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    '''
    plain_scalers = {
        'standard': 'StandardScaler',
        'minmax': 'MinMaxScaler',
        'maxabs': 'MaxAbsScaler',
        'robust': 'RobustScaler',
    }
    norm_methods = ('l1', 'l2', 'max')
    power_methods = ('box-cox', 'yeo-johnson')
    supported = (
        tuple(plain_scalers) + ('quantile',) + norm_methods + power_methods)
    if method not in supported:
        raise ValueError(
            'Unknown scaling method %r; expected one of: %s'
            % (method, ', '.join(sorted(supported))))

    # Still called on every invocation as before, in case it has side
    # effects. NOTE(review): its result used to force a PowerTransformer
    # whenever it was 'positive' or 'negative', which overwrote every other
    # scaler choice; decide whether 'box-cox' should be rejected or replaced
    # for data it reports as 'negative'.
    self._check_sign_feature()

    if method in plain_scalers:
        transformer = getattr(preprocessing, plain_scalers[method])()
    elif method == 'quantile':
        transformer = preprocessing.QuantileTransformer(
            output_distribution='normal')
    elif method in norm_methods:
        # Normalizer is the estimator form of preprocessing.normalize, which
        # is a function that expects data rather than a norm name.
        transformer = preprocessing.Normalizer(norm=method)
    else:
        transformer = preprocessing.PowerTransformer(method)

    transformer.fit(self.dataframe)
    self.dataframe = transformer.transform(self.dataframe)
    return self.dataframe, transformer
def fit_scaler(sample, n_dims, scaler_out, reshape=True,
               scaler_type='QuantileTransformer'):
    """Fit a scaler on ``sample``, pickle it to ``scaler_out`` and return it.

    Parameters
    ----------
    sample : array-like
        Data to fit on.
    n_dims : int
        Number of columns used when ``reshape`` is true.
    scaler_out : str
        Path of the pickle file to write.
    reshape : bool
        Reshape ``sample`` to ``(-1, n_dims)`` before fitting.
    scaler_type : str
        'QuantileTransformer' (uniform output, 10000 quantiles,
        random_state=0), 'MaxAbsScaler' or 'RobustScaler'.

    Returns
    -------
    The fitted scaler.

    Raises
    ------
    ValueError
        If ``scaler_type`` is not one of the supported names.
    """
    supported = ('QuantileTransformer', 'MaxAbsScaler', 'RobustScaler')
    if scaler_type not in supported:
        raise ValueError(
            'Unknown scaler_type %r; expected one of: %s'
            % (scaler_type, ', '.join(supported)))

    print('Fitting quantile transform', end=' ', flush=True)
    start_time = time.time()
    if reshape:
        sample = np.reshape(sample, (-1, n_dims))

    if scaler_type == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer(
            output_distribution='uniform', n_quantiles=10000, random_state=0)
    elif scaler_type == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()
    else:
        scaler = preprocessing.RobustScaler()
    scaler.fit(sample)

    # '\b' removes the separator space that print inserts between arguments.
    print('(', '\b' + format(time.time() - start_time, '2.1f'), '\b' + ' s)')
    print('Saving scaling to', scaler_out)
    # Context manager so the handle is flushed and closed deterministically.
    with open(scaler_out, 'wb') as handle:
        pickle.dump(scaler, handle)
    return scaler
def hard_voting(df_res, y):
    """Cross-validate nine classifiers and their hard-voting ensemble.

    The features are quantile-transformed once (fitted on all of
    ``df_res``). Each individual classifier is scored with 5-fold
    cross-validated ROC AUC. The hard-voting ensemble is scored with
    accuracy instead: with ``voting='hard'`` it only provides class
    predictions, which the ROC AUC scorer cannot use.

    Returns the (unfitted) ``VotingClassifier``, matching ``soft_voting_1``.
    """
    print('HARD VOTING')
    df_res = preprocessing.QuantileTransformer().fit_transform(df_res)

    # (ensemble key, printed label, estimator) for every ensemble member
    members = [
        ('ada', 'AdaBoostClassifier', ensemble.AdaBoostClassifier()),
        ('mlpc', 'MLPClassifier', MLPClassifier()),
        ('gbs', 'GradientBoosting', ensemble.GradientBoostingClassifier()),
        ('sgdc', 'SGDClassifier', SGDClassifier(loss='log', max_iter=1000)),
        ('lgr', 'LogisticRegression', LogisticRegression()),
        ('lrcv', 'LogisticRegressionCV', LogisticRegressionCV()),
        ('qda', 'QuadraticDiscriminantAnalysis',
         QuadraticDiscriminantAnalysis()),
        ('gnb', 'GaussianNB', GaussianNB()),
        ('lda', 'LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
    ]
    eclf = VotingClassifier(
        estimators=[(key, clf) for key, _, clf in members], voting='hard')

    for _, label, clf in members:
        scores = cross_val_score(clf, df_res, y, cv=5, scoring='roc_auc')
        print("ROC_AUC scoring: %0.5f (+/- %0.5f) [%s]" %
              (scores.mean(), scores.std(), label))

    # Hard voting exposes neither predict_proba nor decision_function, so it
    # is scored on accuracy and labelled accordingly.
    scores = cross_val_score(eclf, df_res, y, cv=5, scoring='accuracy')
    print("Accuracy scoring: %0.5f (+/- %0.5f) [%s]" %
          (scores.mean(), scores.std(), 'Ensemble'))
    return eclf
def transformation(X, method=1, powerMet='yeo-johnson'):
    """Apply a quantile or power transformation to ``X``.

    Args:
        X (float): Input data
        method (int, optional): 0: QuantileTransformer (random_state=0),
            1: PowerTransformer (standardize=False). Any other value leaves
            X untouched. Defaults to 1.
        powerMet (str, optional): PowerTransformer method; only used when
            ``method`` is 1. Defaults to 'yeo-johnson'.

    Returns:
        [float]: X_tr, the transformed version of X (X itself when no
            transformation is selected).
        [transformer]: the transformer fitted on X, or None when no
            transformation is selected.
    """
    if method == 0:
        scaler = preprocessing.QuantileTransformer(random_state=0)
    elif method == 1:
        scaler = preprocessing.PowerTransformer(method=powerMet,
                                                standardize=False)
    else:
        # No transformation requested: return the input and an explicit None
        # instead of failing on an unbound name.
        return X, None
    X_tr = scaler.fit_transform(X)
    return X_tr, scaler
def applyScale(self):
    """Scale ``self.dataSet`` in place according to ``self.optionScale``.

    0 -> leave the data untouched
    1 -> ``preprocessing.scale`` (quick scale)
    2 -> ``StandardScaler``
    3 -> ``MinMaxScaler``
    4 -> ``QuantileTransformer(random_state=0)``
    anything else -> ``PowerTransformer(method='box-cox', standardize=False)``
    """
    option = self.optionScale
    if option == 0:
        return

    data = self.dataSet
    if option == 1:
        # quick scale
        scaled = preprocessing.scale(data)
    elif option == 2:
        # standard scale
        scaled = preprocessing.StandardScaler().fit(data).transform(data)
    elif option == 3:
        # min max scaler
        scaled = preprocessing.MinMaxScaler().fit_transform(data)
    elif option == 4:
        # quantile transformation
        scaled = preprocessing.QuantileTransformer(
            random_state=0).fit_transform(data)
    else:
        # power transformation
        scaled = preprocessing.PowerTransformer(
            method='box-cox', standardize=False).fit_transform(data)
    self.dataSet = scaled
def process_data(self, data):
    """Add a ``<feature>_normalized`` column to ``data`` for each feature.

    ``self.features`` must be a list. Per feature:
    - "peak_width": min-max scaled to [0, 1];
    - names containing "percentage": copied unchanged;
    - names containing 'motif' when ``self.method == "knn"``: divided by
      ``50 ** 0.5``;
    - everything else: quantile-transformed with a fresh
      ``QuantileTransformer``.

    The names of the new columns are stored in ``self.return_features``.
    ``data`` is modified in place and returned.
    """
    if type(self.features) != list:
        print("feature must be list type")
        q()
        return data

    self.return_features = [name + "_normalized" for name in self.features]
    for feature in self.features:
        target = feature + "_normalized"
        if feature == "peak_width":
            lowest = data[feature].min()
            highest = data[feature].max()
            values = data[[feature]].values
            data[target] = (values - lowest) / (highest - lowest)
            continue
        if "percentage" in feature:
            data[target] = data[[feature]].values
            continue
        if ('motif' in feature) and (self.method == "knn"):
            data[target] = data[[feature]].values / (50**0.5)
            continue
        transformer = preprocessing.QuantileTransformer()
        data[target] = transformer.fit_transform(data[[feature]].values)
    return data
def soft_voting(df_res, y=None):
    """Cross-validate six classifiers and their soft-voting ensemble.

    The features are quantile-transformed once (fitted on all of
    ``df_res``), then every classifier and the equally weighted soft-voting
    ensemble is scored with 5-fold cross-validation using each estimator's
    default scorer, i.e. accuracy. One line per model is printed.

    Parameters
    ----------
    df_res : array-like
        Feature matrix.
    y : array-like, optional
        Target labels. When omitted, the module-level variable ``y`` is
        used, which is what this function relied on before.

    Returns the (unfitted) ``VotingClassifier``, matching ``soft_voting_1``.

    Raises
    ------
    NameError
        If ``y`` is not given and no module-level ``y`` exists.
    """
    if y is None:
        # Backward compatibility with callers that set a global ``y``.
        if 'y' not in globals():
            raise NameError(
                "name 'y' is not defined; pass the labels as soft_voting(df_res, y)")
        y = globals()['y']

    df_res = preprocessing.QuantileTransformer().fit_transform(df_res)

    # (ensemble key, printed label, estimator) for every ensemble member
    members = [
        ('ada', 'AdaBoostClassifier', ensemble.AdaBoostClassifier()),
        ('mlpc', 'MLPClassifier', MLPClassifier()),
        ('gbs', 'GradientBoosting', ensemble.GradientBoostingClassifier()),
        ('sgdc', 'SGDClassifier', SGDClassifier(loss='log', max_iter=1000)),
        ('lgr', 'LogisticRegression', LogisticRegression()),
        ('lrcv', 'LogisticRegressionCV', LogisticRegressionCV()),
    ]
    eclf = VotingClassifier(
        estimators=[(key, clf) for key, _, clf in members],
        voting='soft',
        weights=[1] * len(members))

    candidates = [(label, clf) for _, label, clf in members]
    candidates.append(('Ensemble', eclf))
    for label, clf in candidates:
        scores = cross_val_score(clf, df_res, y, cv=5)
        # No ``scoring`` is passed, so the value is accuracy; the label used
        # to claim ROC_AUC.
        print("Accuracy scoring: %0.5f (+/- %0.5f) [%s]" %
              (scores.mean(), scores.std(), label))
    return eclf
def preprocess(x_train: numpy.ndarray, y_train: numpy.ndarray, x_test: numpy.ndarray) \
        -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, list]:
    """
    Symmetrize, quantile-scale and PCA-whiten the data.

    The training set is first passed through
    ``helpers.preprocessing.symmetrize_dataset``. A QuantileTransformer and
    then a PCA (all components kept, whitening enabled) are each fitted on
    the training features only and applied to both feature sets.

    :param x_train: the train features.
    :param y_train: the train labels.
    :param x_test: the test features.
    :return: preprocessed x_train, y_train, x_test and the fitted
        ``pca.components_``.
    """
    logger.log('Preprocessing...')

    logger.log('\tSymmetrize training dataset...')
    x_train, y_train = helpers.preprocessing.symmetrize_dataset(
        x_train, y_train)
    logger.log('\t{} training data remained'.format(len(y_train)))

    logger.log('\tScaling data using Quantile Transformer with params:')
    quantile_scaler = preprocessing.QuantileTransformer(random_state=0)
    logger.log('\t{}'.format(quantile_scaler.get_params()))
    x_train = quantile_scaler.fit(x_train).transform(x_train)
    x_test = quantile_scaler.transform(x_test)

    logger.log('\tApplying Principal Component Analysis with params:')
    # n_components is left unset so that all the information is kept.
    pca = PCA(whiten=True, random_state=0)
    logger.log('\t{}'.format(pca.get_params()))
    x_train = pca.fit(x_train).transform(x_train)
    x_test = pca.transform(x_test)

    return x_train, y_train, x_test, pca.components_
def normalize_data(train_data, test_data, validation_1, validation_2, type=None):
    """Scale four data sets with one scaler fitted on the training set.

    ``type`` selects the scaler: 'standard', 'min_max', 'l1', 'l2' or
    'quantile_normal'. For any other value a message is written through
    ``print_out`` and the inputs are returned unchanged.

    Returns the four data sets in the order they were passed in.
    """
    scaler = None
    if type == 'standard':
        scaler = preprocessing.StandardScaler()
    elif type == 'min_max':
        scaler = preprocessing.MinMaxScaler()
    elif type == 'l1' or type == 'l2':
        # The norm name is passed positionally to Normalizer.
        scaler = preprocessing.Normalizer(type)
    elif type == 'quantile_normal':
        scaler = preprocessing.QuantileTransformer(
            output_distribution='normal')

    if scaler is None:
        print_out(f, "Invalid scaling method - no scaling has been done")
        return train_data, test_data, validation_1, validation_2

    # Fit on the training data only; reuse the fitted scaler everywhere.
    scaler.fit(train_data)
    scaled_train = scaler.transform(train_data)
    scaled_test = scaler.transform(test_data)
    scaled_val_1 = scaler.transform(validation_1)
    scaled_val_2 = scaler.transform(validation_2)
    return scaled_train, scaled_test, scaled_val_1, scaled_val_2
def standardization(X_train, X_test=None, mode='zscore', scaler=None):
    """Scale ``X_train`` (and optionally ``X_test``) with a shared scaler.

    Parameters
    ----------
    X_train : array-like
        Training data. The scaler is fitted on it unless one is supplied.
    X_test : array-like, optional
        Test data. ``None`` or an empty sequence means "no test data".
    mode : str
        'zscore' (StandardScaler), 'minmax' (MinMaxScaler), 'quantile'
        (QuantileTransformer with normal output) or 'normalize' (stateless
        ``preprocessing.normalize``). Ignored when ``scaler`` is given.
    scaler : object, optional
        Already fitted object with a ``transform`` method to reuse.

    Returns
    -------
    new_X_train :
        Scaled training data cast to float32.
    new_X_test :
        Scaled test data, or ``[]`` when no test data was supplied.
    scaler :
        The scaler that was used, or ``None`` for 'normalize' and for an
        unknown mode (in which case the data is returned unscaled).
    """
    # Length-based emptiness test: comparing an ndarray with ``[]`` is an
    # element-wise operation and does not yield a usable truth value.
    if X_test is None:
        has_test = False
    elif hasattr(X_test, 'shape'):
        has_test = X_test.shape[0] > 0
    else:
        has_test = len(X_test) > 0

    new_X_test = []
    fitted_modes = {
        'zscore': lambda: preprocessing.StandardScaler(),
        'minmax': lambda: preprocessing.MinMaxScaler(),
        'quantile': lambda: preprocessing.QuantileTransformer(
            output_distribution='normal'),
    }

    if scaler is not None or mode in fitted_modes:
        if scaler is None:
            scaler = fitted_modes[mode]().fit(X_train)
        new_X_train = scaler.transform(X_train)
        if has_test:
            new_X_test = scaler.transform(X_test)
    elif mode == 'normalize':
        new_X_train = preprocessing.normalize(X_train)
        if has_test:
            # Stateless: there is no fitted scaler to call here.
            new_X_test = preprocessing.normalize(X_test)
    else:
        print("'mode = %s' is not defined" % (mode))
        new_X_train = X_train
        new_X_test = X_test if X_test is not None else []

    new_X_train = new_X_train.astype(np.float32)
    return new_X_train, new_X_test, scaler
def test_transform_ct_1(self):
    """
    Unit test for apply_preprocessing on ColumnTransformer with drop option and sklearn encoder.
    """
    y = pd.DataFrame(data=[0, 1], columns=['y'])

    train = pd.DataFrame({
        'num1': [0, 1],
        'num2': [0, 2],
        'other': ['A', 'B']
    })

    enc = ColumnTransformer(transformers=[
        ('power', skp.QuantileTransformer(n_quantiles=2), ['num1', 'num2'])
    ], remainder='drop')
    enc.fit(train, y)

    train_preprocessed = pd.DataFrame(enc.transform(train))
    clf = cb.CatBoostClassifier(n_estimators=1).fit(train_preprocessed, y)

    test = pd.DataFrame({
        'num1': [0, 1, 1],
        'num2': [0, 2, 3],
        'other': ['A', 'B', 'C']
    })

    expected = pd.DataFrame(enc.transform(test))
    result = apply_preprocessing(test, clf, enc)

    assert result.shape == expected.shape
    # all(...) is required: asserting the bare list is always true because a
    # non-empty list is truthy, so the column check never ran.
    # NOTE(review): if this now fails, compare the type of result.columns
    # with the entries of clf.feature_names_.
    assert all(column in clf.feature_names_ for column in result.columns)
    assert all(expected.index == result.index)
    assert all(
        str(type_result) == str(expected.dtypes[index])
        for index, type_result in enumerate(result.dtypes))
def feature_selection(df_res, y, file_number):
    """Return the features a gradient-boosting model gives zero importance.

    The features are quantile-transformed (fitted on all of ``df_res``), a
    ``GradientBoostingClassifier`` is fitted on a 75% training split, and
    its train/test error rates plus a 5-fold cross-validated ROC AUC are
    printed. Features whose importance is exactly 0 are collected, ordered
    by the descending-importance sort.

    Parameters
    ----------
    df_res : pandas.DataFrame
        Feature matrix; its column names are the feature names.
    y : array-like
        Target labels.
    file_number
        Used to build the report file name ``res_<file_number>.txt``.

    Returns
    -------
    list
        Names of the zero-importance features.
    """
    feature_names = df_res.columns

    df_res = preprocessing.QuantileTransformer().fit_transform(df_res)

    X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(
        df_res, y, test_size=0.25, random_state=11, shuffle=True)

    gbc = ensemble.GradientBoostingClassifier()
    gbc.fit(X_res_train, y_res_train)

    err_train = np.mean(y_res_train != gbc.predict(X_res_train))
    err_test = np.mean(y_res_test != gbc.predict(X_res_test))
    print(err_train, err_test)

    scores = cross_val_score(gbc, df_res, y, cv=5, scoring='roc_auc')
    print("ROC_AUC GradientBoostingClassifier: %0.2f (+/- %0.5f)" %
          (scores.mean(), scores.std()))

    importances = gbc.feature_importances_
    indices = np.argsort(importances)[::-1]
    low_cost_features = [
        feature_names[idx] for idx in indices if importances[idx] == 0
    ]

    # The handle gets its own name; the previous loop variable reused the
    # name of the open file.
    # NOTE(review): the source layout did not show whether this summary line
    # belongs inside the redirect (written to the file) or after it (printed
    # to the console). It is kept inside so the report file is not left
    # empty -- confirm.
    with open('res_' + str(file_number) + '.txt', 'w') as report:
        with redirect_stdout(report):
            print('Кол-во пустых фич: ', len(low_cost_features))
    return low_cost_features
def QuantileTransformer(train_df, test_df, HP):
    """Quantile-transform the feature columns of a train/test pair.

    The last column of each frame is treated as the label and all preceding
    columns as features. The transformer is fitted on the training features
    only and applied to both sets.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames whose last column is the label.
    HP : dict
        ``HP['QuantileTransformer']`` must provide ``n_quantiles``,
        ``output_distribution``, ``ignore_implicit_zeros``, ``subsample``
        and ``copy``; all five are forwarded to
        ``preprocessing.QuantileTransformer``.

    Returns
    -------
    (train_df_transformed, test_df_transformed)
        New frames with the transformed features under their original
        column names and the label in a column named ``label``.
    """
    params = HP['QuantileTransformer']
    transformer = preprocessing.QuantileTransformer(
        n_quantiles=params['n_quantiles'],
        output_distribution=params['output_distribution'],
        ignore_implicit_zeros=params['ignore_implicit_zeros'],
        # subsample was read from HP but never handed to the transformer.
        subsample=params['subsample'],
        copy=params['copy'])

    train_x = train_df.iloc[:, :-1]
    train_y = train_df.iloc[:, -1:]
    test_x = test_df.iloc[:, :-1]
    test_y = test_df.iloc[:, -1:]

    # Work on copies so the caller's frames are never modified, whatever the
    # value of the transformer's ``copy`` option.
    train_x_transformed = transformer.fit_transform(train_x.copy())
    test_x_transformed = transformer.transform(test_x.copy())

    train_df_transformed = pd.DataFrame(
        train_x_transformed,
        columns=list(train_x.columns)).assign(label=train_y.values)
    test_df_transformed = pd.DataFrame(
        test_x_transformed,
        columns=list(test_x.columns)).assign(label=test_y.values)
    return train_df_transformed, test_df_transformed
def quantile_norm(df):
    """
    Map every column of ``df`` to a normal distribution with
    preprocessing.QuantileTransformer (random_state=0).

    Parameters
    ----------
    df : DataFrame

    Returns
    -------
    DataFrame
        The transformed values with default integer row and column labels.

    Notes
    -----
    As noted by the original author, the outer bounds are very low
    probability regions of the normal distribution, so the minimum and
    maximum end up roughly 5 standard deviations from the mean, which
    limits the utility of this transform.
    """
    assert isinstance(df, pd.DataFrame)
    transformer = preprocessing.QuantileTransformer(
        output_distribution='normal', random_state=0)
    transformed = transformer.fit_transform(df)
    return pd.DataFrame(transformed)
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=1, verbose=2) grid.fit(X_train, y_train) print(grid.score(X_test, y_test)) print(grid.best_estimator_.score(X_test, y_test)) # same result print(grid.best_params_) # https://iaml.it/blog/optimizing-sklearn-pipelines from sklearn import feature_selection from sklearn.linear_model import Ridge n_features_to_test = np.arange(1, 3) alpha_to_test = 2.0**np.arange(-6, +6) scalers_to_test = [ preprocessing.StandardScaler(), preprocessing.RobustScaler(), preprocessing.QuantileTransformer() ] params = [ {'scaler': scalers_to_test, 'reduce_dim': [decomposition.PCA()], # Parameter of the parameter 'reduce_dim__n_components': n_features_to_test, 'regressor__alpha': alpha_to_test}, # Parameter of the parameter {'scaler': scalers_to_test, 'reduce_dim': [feature_selection.SelectKBest(feature_selection.f_regression)], 'reduce_dim__k': n_features_to_test,\ 'regressor__alpha': alpha_to_test} ] pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
def preparation(self, trips, territorial, users, changes=0, binary=0):
    """Build (or reload) the trip dataset and encode it for training.

    Parameters
    ----------
    trips, territorial, users
        Inputs handed to ``pd.read_excel`` when ``changes`` is truthy
        (presumably paths to the Excel sources -- confirm with callers).
        They are not used when ``changes`` is falsy.
    changes : int or bool, default 0
        Truthy: rebuild the dataset by joining every trip with the
        territorial row of its destination census id and with its user
        row, store the result on ``self.df`` / ``self.test`` and write
        both to Excel under ``data/``.
        Falsy: load ``self.df`` and ``self.to_label`` from the Excel files
        under ``data/``.
    binary : int or bool, default 0
        Falsy: ``self.labels`` is the 13-class category mapping.
        Truthy: ``self.binarization_apply`` is applied to every row and
        ``self.labels`` is the two-class systematic / non-systematic
        mapping.

    Returns
    -------
    (data, target)
        ``data`` is the output of the fitted ``ColumnTransformer`` on the
        feature frame; ``target`` is the one-hot encoding of the
        ``'category'`` column.

    Side effects
    ------------
    Sets ``self.df``, ``self.labels``, ``self.ct`` and ``self.sc_fit``, plus
    ``self.test`` (rebuild branch) or ``self.to_label`` (load branch).
    The rebuild branch also writes two Excel files.
    """
    if changes:
        trips = pd.read_excel(trips, index_col=0)
        users = pd.read_excel(users, index_col=4)
        users = users.drop(columns=['Unnamed: 0'])
        terr = pd.read_excel(territorial, index_col=1)
        terr = terr.drop(columns=['Unnamed: 0'])
        terr = terr.fillna(terr.mean())

        labelled_rows = []
        unlabelled_rows = []
        # Integer code for every distinct transport mode.
        modes = {
            mode: code
            for code, mode in enumerate(trips['mode'].unique())
        }
        format_date = "%Y-%m-%d %H:%M:%S"

        for _, trip in trips.iterrows():
            user_id = trip['user_id']
            tmp_obj = {}
            d_census_id = trip['d_census_id']
            try:
                territorial_info = terr.loc[int(d_census_id)]
                user_info = users.loc[user_id]
                # for i in self.territorial_features:
                for i in terr.columns:
                    tmp_obj[i] = territorial_info[i]
                for i in self.trip_features:
                    tmp_obj[i] = trip[i]
                for i in user_info.index:
                    if i == 'Row':
                        continue
                    tmp_obj[i] = user_info[i]

                # Both timestamps are shifted by one hour before being
                # bucketed (presumably a timezone correction -- confirm).
                one_hour = datetime.timedelta(hours=1)
                o_d = datetime.datetime.strptime(
                    tmp_obj['o_datetime'], format_date) + one_hour
                d_d = datetime.datetime.strptime(
                    tmp_obj['d_datetime'], format_date) + one_hour
                tmp_obj['o_datetime'] = self.date_to_cat(o_d)
                tmp_obj['d_datetime'] = self.date_to_cat(d_d)
                tmp_obj['mode'] = modes[tmp_obj['mode']]

                # Repair the two misspelt category names found in the data.
                if tmp_obj['category'] == 'helth':
                    tmp_obj['category'] = 'health'
                elif tmp_obj['category'] == 'admni_chores':
                    tmp_obj['category'] = 'admin_chores'
                tmp_obj['category_label'] = tmp_obj['category']
                tmp_obj['category'] = self.categories[tmp_obj['category']]
                tmp_obj['activity_time'] = self.activity_to_cat(
                    tmp_obj['activity_time'])
                tmp_obj['occupation'] = self.occup[tmp_obj['occupation']]

                # Rows without a usable category go to the "to label" set.
                if tmp_obj['category_label'] in ('nan', 'NONE'):
                    unlabelled_rows.append(tmp_obj)
                else:
                    labelled_rows.append(tmp_obj)
            except Exception:
                # Best effort: report the failing row and keep going.
                # Narrowed from a bare `except:` so that KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                traceback.print_exc()

        self.df = pd.DataFrame(labelled_rows)
        self.test = pd.DataFrame(unlabelled_rows)
        self.df.to_excel('data/CompleteDataframe_AllTerritorial.xlsx',
                         index=False)
        self.test.to_excel('data/ToLabel_AllTerritorial.xlsx', index=False)
        # NOTE(review): this branch never sets self.to_label, although the
        # load branch fills it from the file written just above -- confirm
        # whether callers expect it after a rebuild.
    else:
        self.df = pd.read_excel('data/df.xlsx')
        self.to_label = pd.read_excel('data/ToLabel_AllTerritorial.xlsx')
    print('Dataset Loaded')

    # Shuffle the rows.
    self.df = self.df.sample(frac=1)

    if not binary:
        self.labels = {
            'shopping': 3,
            'health': 5,
            'home': 10,
            'work': 9,
            'entertainment': 1,
            'commuting': 0,
            'recreation': 7,
            'education': 2,
            'eating': 4,
            'travel': 6,
            'admin_chores': 8,
            'police': 12,
            'religious': 11
        }
    else:
        self.df = self.df.apply(self.binarization_apply, axis=1)
        self.labels = {
            'systematic (home,work,education)': 0,
            'non-systematic': 1
        }

    target = preprocessing.OneHotEncoder().fit_transform(
        self.df['category'].values.reshape(-1, 1))
    df_train = self.df.drop(columns=['category', 'category_label'])

    # Columns not named in either list are not passed to a transformer
    # (`remainder` is left at its default).
    self.ct = ColumnTransformer(
        [
            ('oh', preprocessing.OneHotEncoder(), [
                'activity_time', 'mode', 'd_datetime', 'o_datetime',
                'occupation', 'gender', 'bin_weekday', 'bin_category'
            ]),
            (
                'qt',
                preprocessing.QuantileTransformer(
                    output_distribution='normal'),
                [
                    'home',
                    'work',
                    'eating',
                    'entertainment',
                    'recreation',
                    'shopping',
                    'travel',
                    'admin_chores',
                    'religious',
                    'health',
                    'police',
                    'education',
                    'age',
                    # 'P_TOT',
                    # 'MALE_TOT','FEM_TOT',
                    'age 25-39',
                    'age 40-64',
                    'age >65',
                    'age 10-24',
                    'P47',
                    'P48',
                    'P49',
                    'P61',
                    'P62',
                    # 'INCOME'
                ]),
            # ('mm', preprocessing.MinMaxScaler(), ['P61'])
        ],
        # remainder='passthrough'
    )

    # Fill remaining gaps with the column means before fitting.
    df_train = df_train.fillna(df_train.mean())
    self.sc_fit = self.ct.fit(df_train)
    data = self.sc_fit.transform(df_train)
    return data, target
import matplotlib
# `get_ipython().magic(...)` is deprecated; `run_line_magic` is the supported
# way to invoke the same `%matplotlib inline` line magic.
get_ipython().run_line_magic('matplotlib', 'inline')
# Pie chart of how many rows fall into each cluster.
model_data_l.cluster.value_counts().plot(kind='pie')


# ### 4.2 Second k-means approach: transform the variable distributions to normal -- used for customer segmentation
# - 1. Transform the variable distributions to normal

# In[19]:


import numpy as np
from sklearn import preprocessing

# Map every factor-score column onto a normal distribution; random_state is
# fixed so the result is reproducible.
quantile_transformer = preprocessing.QuantileTransformer(
    output_distribution='normal', random_state=0)
fa_scores_trans = quantile_transformer.fit_transform(fa_scores)
fa_scores_trans = pd.DataFrame(fa_scores_trans)
# Name the three positional columns.
fa_scores_trans = fa_scores_trans.rename(
    columns={0: "ATM_POS", 1: "TBM", 2: "CSC"})
fa_scores_trans.head()


# In[20]:


# Absolute skewness of each transformed column, largest first.
var = ["ATM_POS", "TBM", "CSC"]
skew_var = {}
for i in var:
    skew_var[i] = abs(fa_scores_trans[i].skew())
skew = pd.Series(skew_var).sort_values(ascending=False)
skew