class QuantileTransformerImpl():
    """Lazy wrapper around ``SKLModel``.

    The constructor only records hyperparameters. ``fit`` builds a fresh
    ``SKLModel`` from them on every call and ``transform`` delegates to the
    instance created by the most recent ``fit``.
    """

    def __init__(self, n_quantiles=1000, output_distribution='uniform',
                 ignore_implicit_zeros=False, subsample=100000,
                 random_state=None, copy=True):
        """Remember the keyword arguments that ``fit`` forwards to ``SKLModel``."""
        self._hyperparams = dict(
            n_quantiles=n_quantiles,
            output_distribution=output_distribution,
            ignore_implicit_zeros=ignore_implicit_zeros,
            subsample=subsample,
            random_state=random_state,
            copy=copy,
        )

    def fit(self, X, y=None):
        """Create the wrapped model, fit it on ``X`` (and ``y`` if given), return self."""
        estimator = SKLModel(**self._hyperparams)
        self._sklearn_model = estimator
        # ``y`` is only forwarded when the caller actually supplied it.
        if y is None:
            estimator.fit(X)
        else:
            estimator.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped model's ``transform(X)``; requires a prior ``fit``."""
        return self._sklearn_model.transform(X)
def fit_model(model, scaler='quantileNormal'):
    """Train a scaler -> LDA -> classifier pipeline on all data and save it.

    The fitted pipeline is written to ``final_model.pkl`` with ``joblib.dump``
    so the exact same preprocessing steps can be applied to new data.

    :param model: classifier key: 'RF', 'SVM', 'LR', 'kNN', 'MLP', 'SGD'
        or 'extraTrees'
    :param scaler: scaler key: 'standard', 'minMax', 'maxAbs', 'robust',
        'quantileUniform', 'quantileNormal' or 'L2Norm'
    :return: the string "<model> Model using <scaler> scaler"
    :raises ValueError: if ``scaler`` or ``model`` is not one of the keys above
    """
    # Resolve the scaler before doing any work so that an unknown name fails
    # immediately with a clear error instead of a NameError on ``norm`` when
    # the pipeline is assembled.
    if scaler == 'standard':
        norm = StandardScaler()
    elif scaler == 'minMax':
        norm = MinMaxScaler()
    elif scaler == 'maxAbs':
        norm = MaxAbsScaler()
    elif scaler == 'robust':
        norm = RobustScaler(quantile_range=(25, 75))
    elif scaler == 'quantileUniform':
        norm = QuantileTransformer(output_distribution='uniform')
    elif scaler == 'quantileNormal':
        norm = QuantileTransformer(output_distribution='normal')
    elif scaler == 'L2Norm':
        norm = Normalizer()
    else:
        raise ValueError('Scaler is not defined: %s' % scaler)

    # Hyperparameters below are fixed, presumably from an earlier search.
    if model == 'RF':
        classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features=1, min_samples_leaf=6, min_samples_split=5)
    elif model == 'SVM':
        classifier = SVC(kernel='linear', gamma=0.15043359874882642, C=37.04299991666622)
    elif model == 'LR':
        classifier = LogisticRegression(solver='lbfgs', penalty='l2', C=12.860378815138466, multi_class='multinomial', warm_start=True)
    elif model == 'kNN':
        classifier = KNeighborsClassifier(algorithm='ball_tree', leaf_size=31, n_neighbors=29)
    elif model == 'MLP':
        classifier = MLPClassifier(activation='relu', alpha=0.05440165708835292, batch_size=358, solver='adam')
    elif model == 'SGD':
        classifier = SGDClassifier(penalty='elasticnet', loss='log', alpha=0.02011798244191186, max_iter=50)
    elif model == 'extraTrees':
        classifier = ExtraTreesClassifier(n_estimators=50, criterion='gini', max_features=3, min_samples_leaf=5, min_samples_split=8)
    else:
        raise ValueError('Classifier not recognized {}.'.format(model))

    # Use all the data to guarantee that every genre is represented.
    # The third return value (a genre mapping) is not needed here.
    X, y, _ = loadFeatures_NoSplit()

    # Use and save the pipeline so the same steps are applied to new test data.
    pipe = Pipeline([('normalize', norm), ('reduce_dim', LDA()), ('classify', classifier)])
    pipe.fit(X, y)

    # Predictions with new data (kept for reference):
    # score = pipe.score(X,y)
    # features = pd.read_csv('sampleFeatures.csv', index_col=0, header=[0, 1, 2])
    # X = features.iloc[:, 0:518].values
    # predict = pipe.predict(X)
    # print("Score is %s and Prediction is %s with model %s" % (score, predict, model))

    joblib.dump(pipe, 'final_model.pkl')
    result = "%s Model using %s scaler" % (model, scaler)
    return result
def fit(self, X, y=None):
    """Build a new ``SKLModel`` from the stored hyperparameters and fit it.

    ``y`` is passed to the underlying ``fit`` only when it is not ``None``.
    Returns ``self``.
    """
    estimator = SKLModel(**self._hyperparams)
    self._sklearn_model = estimator
    if y is None:
        estimator.fit(X)
    else:
        estimator.fit(X, y)
    return self
def __init__(self, n_quantiles=1000, output_distribution='uniform',
             ignore_implicit_zeros=False, subsample=100000,
             random_state=None, copy=True):
    """Record the hyperparameters and eagerly construct the wrapped ``Op``."""
    hyperparams = dict(
        n_quantiles=n_quantiles,
        output_distribution=output_distribution,
        ignore_implicit_zeros=ignore_implicit_zeros,
        subsample=subsample,
        random_state=random_state,
        copy=copy,
    )
    self._hyperparams = hyperparams
    # The wrapped operator receives exactly the recorded keyword arguments.
    self._wrapped_model = Op(**hyperparams)
def normalize(X, scaler='quantileNormal'):
    """Fit the scaler selected by name on ``X`` and return the transformed data.

    :param X: data handed to the scaler's ``fit_transform``
    :param scaler: 'standard', 'minMax', 'maxAbs', 'robust',
        'quantileUniform', 'quantileNormal' or 'L2Norm'
    :return: the result of ``fit_transform(X)``
    :raises ValueError: if ``scaler`` is none of the names above
    """
    # Ordered (name, factory) pairs; the factories are lazy so that only the
    # selected scaler is ever constructed.
    builders = (
        ('standard', lambda: StandardScaler()),
        ('minMax', lambda: MinMaxScaler()),
        ('maxAbs', lambda: MaxAbsScaler()),
        ('robust', lambda: RobustScaler(quantile_range=(25, 75))),
        ('quantileUniform', lambda: QuantileTransformer(output_distribution='uniform')),
        ('quantileNormal', lambda: QuantileTransformer(output_distribution='normal')),
        ('L2Norm', lambda: Normalizer()),
    )
    for key, build in builders:
        if scaler == key:
            return build().fit_transform(X)
    raise ValueError('Scaler is not defined: %s' % scaler)
def quantile_scale_map(self, col_need, distribution='uniform', drop_origin_col=False):
    """
    Quantile-transform the selected columns and append the result to self.df.

    The fitted transformer is kept on self.quantile_transform. New columns
    are named "<column>_q<Distribution>Map" (e.g. "_qUniformMap").

    :param col_need: column labels to select from self.df
    :param distribution: 'uniform' (default) or 'normal'
    :param drop_origin_col: forwarded to _scale_map as drop_origin_columns
    :return: None
    """
    transformer = QuantileTransformer(output_distribution=distribution)
    self.quantile_transform = transformer
    selected = self.df.loc[:, col_need]
    mapped = transformer.fit_transform(selected)
    name_suffix = "_q{}Map".format(distribution.capitalize())
    self._scale_map(array=mapped,
                    column_name=col_need,
                    suffix=name_suffix,
                    drop_origin_columns=drop_origin_col)
def main():
    """Run ``machine_learn`` on every enabled scaling of the wine data.

    Loads ``X, y`` with ``load_wine()``, applies each enabled transformation
    to ``X``, wraps the result in a DataFrame that reuses ``X``'s index and
    columns, and passes it to ``machine_learn``. Finally calls
    ``make_plots(scores)``.
    """
    X, y = load_wine()

    # Scalings to evaluate. The commented-out entries are alternatives that
    # can be re-enabled for comparison.
    distributions = [
        #('Unscaled data', X),
        #('Data after standard scaling', StandardScaler().fit_transform(X)),
        #('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
        #('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
        #('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
        ('Data after quantile transformation (uniform pdf)',
         QuantileTransformer(output_distribution='uniform').fit_transform(X)),
        #('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X)),
        #('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X))  # performed poorly
    ]

    banner = '========================'
    for label, scaled_X in distributions:
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # unlike the print statement, which is a syntax error on Python 3.
        print(banner)
        print(label)
        print(banner)
        scaled_X_df = pd.DataFrame(scaled_X, index=X.index, columns=X.columns)
        machine_learn(scaled_X_df, y)
        print(banner)
        print('')

    # NOTE(review): ``scores`` is not defined in this function; presumably a
    # module-level collection filled by machine_learn -- confirm.
    make_plots(scores)
def getClassWeights(self, weightType, dataSet=None):
    """Compute per-class weights from the training label images and save them.

    Class frequencies are counted over the first ``trainSize / PART`` label
    images (``PART`` is fixed to 10), turned into weights according to
    ``weightType`` and written with ``np.save`` to a file named
    ``"classWeights" + str(config["x"]) + str(config["y"]) + config["name"]``.
    Nothing is returned.

    :param weightType: one of "Freq", "MedianFreq", "1x", "2x", "division",
        "relativeToMin", "quantile"
    :param dataSet: unused by this method
    :raises ValueError: if ``weightType`` is not one of the values above
    """
    invalidTypeMsg = "Wrong weights calc type given! Valid arguments are [Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]"
    validTypes = ("Freq", "MedianFreq", "1x", "2x", "division", "relativeToMin", "quantile")
    if weightType not in validTypes:
        raise ValueError(invalidTypeMsg)

    # get class weights because of imbalanced dataset (e.g. a lot of road and buildings)
    print("Calculate class ", weightType, " weights...")

    # only calculate the weights from a specific split of the dataset. For performance reasons
    # PART = 1 would be the total dataset
    PART = 10

    classCount = np.zeros(self.config["classes"])

    # count all the classes in every given mask image
    for i in range(int(self.config["trainSize"] / PART)):
        labelImg = self.getImage(i, "trainLabel").flatten()
        labelClassCount = np.bincount(labelImg, minlength=self.config["classes"])
        classCount += labelClassCount

        # progress output every 1000 / PART images
        if i % int(1000 / PART) == 0:
            print("Label image ", i, "/", self.config["trainSize"] / PART)

    print("Class count: ", classCount.shape, classCount)

    # choose class weights type
    # Frequency
    if weightType == "Freq":
        classWeights = np.median(classCount) / classCount

    # Median Frequency
    elif weightType == "MedianFreq":
        classWeights = np.median(np.median(classCount) / classCount) / (
            np.median(classCount) / classCount)

    # Simple Total/ClassCount
    elif weightType == "1x":
        classWeights = 1 - (classCount / classCount.sum() * 1)

    # Simple Total/ClassCount doubled effect
    elif weightType == "2x":
        classWeights = 1 - (classCount / classCount.sum() * 2)

    # Simple Total/ClassCount divided by Minimum
    elif weightType == "division":
        classWeights = classCount.sum() / classCount
        # divide with minimum; a ratio of exactly 1 is replaced by a huge
        # value first, so it cannot become the minimum used for the division
        classWeights[classWeights == 1] = 999999
        classWeights /= classWeights.min()

    # all weights are relative to the smallest class which is assigned the 1.0.
    elif weightType == "relativeToMin":
        classWeights = classCount.min() / classCount
        print("Class weights: ", classWeights.shape, classWeights)
        # NOTE(review): the original comment promises a minimal value of 0.1,
        # but this multiplies weights below 0.1 by 10 instead of clamping
        # them -- confirm which behaviour is intended.
        classWeights[(classWeights < 0.1)] *= 10

    # using the quantile transformer of sklearn all the weights are distributed in 0-0.9999. Minimal assigned value is 0.1
    elif weightType == "quantile":
        # Public import path; the private ``sklearn.preprocessing.data``
        # module is not available in current scikit-learn releases.
        from sklearn.preprocessing import QuantileTransformer
        _scaler = QuantileTransformer()

        # the transformer expects a 2-D (n_samples, n_features) array
        classCount = np.expand_dims(classCount, axis=1)
        classWeights = _scaler.fit_transform(classCount)
        classWeights = np.around(classWeights, decimals=4)
        classWeights = np.squeeze(classWeights)
        # invert so that frequent classes receive small weights
        classWeights = 1 - classWeights
        classWeights[(classWeights < 0.1)] = 0.1

    else:
        # Defensive: unreachable after the validation at the top.
        raise ValueError(invalidTypeMsg)

    # eliminate inf values (produced by classes with a zero count)
    classWeights[(classWeights == np.inf)] = 1

    print("Class weights: ", classWeights.shape, classWeights)
    np.save(
        "classWeights" + str(self.config["x"]) + str(self.config["y"]) +
        self.config["name"], classWeights)
# Feature 5 has a few but very large outliers. X = X_full[:, [0, 5]] distributions = [ ('Unscaled data', X), ('Data after standard scaling', StandardScaler().fit_transform(X)), ('Data after min-max scaling', MinMaxScaler().fit_transform(X)), ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)), ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)), ('Data after quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform') .fit_transform(X)), ('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal') .fit_transform(X)), ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)) ] # scale the output between 0 and 1 for the colorbar y = minmax_scale(y_full) def create_axes(title, figsize=(16, 6)): fig = plt.figure(figsize=figsize) fig.suptitle(title)
# Encode every value of y1 as a 7-character binary string ...
for i1 in y1:
    a1 = np.binary_repr(i1, 7)
    ans1.append(a1)
df1['ans1'] = pd.DataFrame(ans1)

# ... and spread the 7 characters over the columns BS10 .. BS16 of df.
for i1 in range(7):
    df['BS1' + str(i1)] = df1['ans1'].str[i1]

# remove original style name now
# (``axis`` is passed by keyword: pandas 2.0 no longer accepts it positionally)
df.drop(['Style Name'], axis=1, inplace=True)
df = df.dropna(axis=0)

# Convert all object-typed columns to numbers; unparsable values become NaN.
c = df.columns[df.dtypes.eq(object)]
df[c] = df[c].apply(pd.to_numeric, errors='coerce', axis=0)

scaler = QuantileTransformer()
#df3 = scaler.fit_transform(df)
X5 = np.array(df.drop(['Score'], axis=1))
y5 = np.array(df['Score'])
X3 = scaler.fit_transform(pd.DataFrame(X5))
#y3 = scaler.fit_transform(pd.DataFrame(y5))
y3 = 1.2 - np.log(y5)

# NOTE(review): the split uses the unscaled X5 / y5; the transformed X3 / y3
# computed above are not used in this excerpt -- confirm this is intended.
# NOTE(review): ``cross_validation`` is the legacy scikit-learn module name;
# newer releases provide train_test_split in ``sklearn.model_selection``.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X5, y5, test_size=0.20)

# Data munging completed. ######################
from keras import layers, models
class FeatureMap(object):
    """Column-wise feature transformations applied to a private DataFrame copy.

    The constructor deep-copies the input frame into ``self.df``; every method
    transforms or extends ``self.df`` in place of returning a new frame. The
    most recently fitted encoder / scaler of each kind is kept on the instance
    (``onehot``, ``label_code``, ``standard_scale``, ...).
    """

    def __init__(self, df):
        """
        :param df: source DataFrame; it is deep-copied, never modified
        """
        self.df = copy.deepcopy(df)
        self.onehot = None
        self.label_code = None
        # maps "<column>_<code>" -> "<column>_<original label>" (see label_encode)
        self.col_label_dict = dict()
        self.min_max_scale = None
        self.max_abs_scale = None
        self.standard_scale = None
        self.robust_scale = None
        self.quantile_transform = None

    def log_map(self, col_need, col_replace=True):
        """
        apply the natural logarithm to the selected columns
        :param col_need: list of column names
        :param col_replace: overwrite the columns if True, otherwise add new
            columns named "<column>_log"
        :return: None
        """
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.apply(lambda x: np.log(x))
        else:
            col_need_extend = [col + "_log" for col in col_need]
            self.df[col_need_extend] = df_need.apply(lambda x: np.log(x))

    def box_cox_map(self, col_need, gamma=1.0, col_replace=True):
        """
        y = ((1+x)**gamma - 1) / gamma  if gamma != 0
            log(1+x)                    if gamma == 0
        ref: http://onlinestatbook.com/2/transformations/box-cox.html
        :param col_need: list of column names
        :param gamma: power parameter passed to boxcox1p
        :param col_replace: overwrite the columns if True, otherwise add new
            columns named "<column>_boxCox"
        :return: None
        """
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.applymap(lambda x: boxcox1p(x, gamma))
        else:
            col_need_extend = [col + "_boxCox" for col in col_need]
            self.df[col_need_extend] = df_need.applymap(
                lambda x: boxcox1p(x, gamma))

    def onehot_encode(self, col_need, start_zero=True):
        """
        onehot encode the columns you need; the originals are dropped and the
        encoded columns "<column><index>" are appended to self.df
        note: the origin category should be integer in range(classes) or range(classes+1)
        :param col_need: list of column names
        :param start_zero: category is in range(classes); if False the
            generated column indices start at 1
        :return: None
        """
        # NOTE(review): ``sparse=False`` and ``n_values_`` belong to an older
        # scikit-learn OneHotEncoder API -- confirm the installed version
        # still provides them.
        self.onehot = OneHotEncoder(sparse=False)
        array_onehot = self.onehot.fit_transform(self.df.loc[:, col_need])
        first_index = 0 if start_zero else 1
        col_onehot = []
        for col_index, col_name in enumerate(col_need):
            n_values = self.onehot.n_values_[col_index]
            for hot_index in range(first_index, n_values):
                col_onehot.append(col_name + str(hot_index))
        self.df.drop(col_need, axis=1, inplace=True)
        df_onehot = pd.DataFrame(array_onehot, columns=col_onehot, index=self.df.index)
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def label_encode(self, col_need):
        """
        label encode one column; the column is replaced by integer codes that
        start from 0, and col_label_dict records "<column>_<code>" ->
        "<column>_<original label>"
        :param col_need: list of column names, length should be 1
        :return: None
        """
        assert isinstance(col_need, list) and len(col_need) == 1
        self.label_code = LabelEncoder()
        array_label_code = self.label_code.fit_transform(self.df.loc[:, col_need])
        label_list = list(self.label_code.classes_)
        for i, x in enumerate(label_list):
            # str(x): labels are not necessarily strings (e.g. integer classes)
            self.col_label_dict[col_need[0] + "_" + str(i)] = col_need[0] + "_" + str(x)
        self.df.drop(col_need, axis=1, inplace=True)
        df_label_code = pd.DataFrame(array_label_code, columns=col_need, index=self.df.index)
        self.df = pd.concat([self.df, df_label_code], axis=1)

    def standard_scale_map(self, col_need, drop_origin_col=False):
        """
        StandardScaler on the selected columns, appended as "<column>_stdScale"
        :param col_need: list of column names
        :param drop_origin_col: drop the source columns if True
        :return: None
        """
        self.standard_scale = StandardScaler()
        array_standard = self.standard_scale.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_standard,
                        column_name=col_need,
                        suffix="_stdScale",
                        drop_origin_columns=drop_origin_col)

    def min_max_scale_map(self, col_need, drop_origin_col=False):
        """
        MinMaxScaler on the selected columns, appended as "<column>_minMaxScale"
        :param col_need: list of column names
        :param drop_origin_col: drop the source columns if True
        :return: None
        """
        self.min_max_scale = MinMaxScaler()
        array_min_max = self.min_max_scale.fit_transform(self.df.loc[:, col_need])
        self._scale_map(array=array_min_max,
                        column_name=col_need,
                        suffix="_minMaxScale",
                        drop_origin_columns=drop_origin_col)

    def max_abs_scale_map(self, col_need, drop_origin_col=False):
        """
        MaxAbsScaler on the selected columns, appended as "<column>_maxAbsScale"
        :param col_need: list of column names
        :param drop_origin_col: drop the source columns if True
        :return: None
        """
        self.max_abs_scale = MaxAbsScaler()
        array_max_abs = self.max_abs_scale.fit_transform(self.df.loc[:, col_need])
        self._scale_map(array=array_max_abs,
                        column_name=col_need,
                        suffix="_maxAbsScale",
                        drop_origin_columns=drop_origin_col)

    def robust_scale_map(self, col_need, quantile_range=(25, 75), drop_origin_col=False):
        """
        This Scaler removes the median and scales the data according to
        the quantile range (defaults to IQR: Interquartile Range).
        The IQR is the range between the 1st quartile (25th quantile)
        and the 3rd quartile (75th quantile).
        The result is appended as "<column>_robust_scale".
        :param col_need: list of column names
        :param quantile_range: passed to RobustScaler
        :param drop_origin_col: drop the source columns if True
        :return: None
        """
        self.robust_scale = RobustScaler(quantile_range=quantile_range)
        array_robust = self.robust_scale.fit_transform(self.df.loc[:, col_need])
        self._scale_map(array=array_robust,
                        column_name=col_need,
                        suffix="_robust_scale",
                        drop_origin_columns=drop_origin_col)

    def quantile_scale_map(self, col_need, distribution='uniform', drop_origin_col=False):
        """
        QuantileTransformer on the selected columns, appended as
        "<column>_q<Distribution>Map" (e.g. "_qUniformMap")
        :param col_need: list of column names
        :param distribution: 'uniform' (default) or 'normal'
        :param drop_origin_col: drop the source columns if True
        :return: None
        """
        self.quantile_transform = QuantileTransformer(
            output_distribution=distribution)
        array_quantile = self.quantile_transform.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_quantile,
                        column_name=col_need,
                        suffix="_q{}Map".format(distribution.capitalize()),
                        drop_origin_columns=drop_origin_col)

    def _scale_map(self, array, column_name, suffix, drop_origin_columns=False):
        """
        append ``array`` to self.df as columns named "<column><suffix>"
        :param array: transformed values, one column per entry of column_name
        :param column_name: list of source column names
        :param suffix: string appended to every source column name
        :param drop_origin_columns: drop the source columns first if True
        :return: None
        """
        if drop_origin_columns:
            self.df.drop(column_name, axis=1, inplace=True)
        col = [col + suffix for col in column_name]
        df_scale = pd.DataFrame(array, columns=col, index=self.df.index)
        self.df = pd.concat([self.df, df_scale], axis=1)

    def quantile_floor_map(self, col_need, floor_num=5, drop_origin_col=False):
        """
        after quantile_scale_map when distribution='uniform', value is scaled in [0, 1]
        for tree models, onehot encoding is need
        every value is multiplied by floor_num and truncated to an integer;
        the result is appended as "<column>_qFloorMap"
        :param col_need: list of column names, each ending in "_qUniformMap"
            and holding values in [0, 1]
        :param floor_num: uniform floor map
        :param drop_origin_col: drop the source columns if True
        :return: None
        """
        bool0 = (self.df.loc[:, col_need] >= 0) & (self.df.loc[:, col_need] <= 1)
        assert bool0.all().all()
        assert all(x.endswith("_qUniformMap") for x in col_need)
        # builtin ``int`` instead of the ``np.int`` alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same
        array_quantile_floor = (self.df.loc[:, col_need].values * floor_num).astype(int)
        self._scale_map(array=array_quantile_floor,
                        column_name=col_need,
                        suffix="_qFloorMap",
                        drop_origin_columns=drop_origin_col)
# on the distribution of values being transformed # Documentation at https://seaborn.pydata.org/generated/seaborn.distplot.html #this is the list of transformations to perform on the data distributions = [ ('Unscaled data', X, 'Unscaled'), ('Data after standard scaling', StandardScaler().fit_transform(X), 'standard'), ('Data after min-max scaling', MinMaxScaler().fit_transform(X), 'min_max'), ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X),'max_abs'), ('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X),'robust'), ('Data after quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform') .fit_transform(X),'quantile'), ('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal') .fit_transform(X),'gaussian'), ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X),'normalizing'), ('Data after Log + 1', np.log(X+1), 'Log + 1'), ] #define function to produce the graphic def create_plts(x): temp = distributions[x][2] temp, ax = plt.subplots() sns.distplot(distributions[x][1]).set_title(distributions[x][0]) temp.savefig(cols + distributions[x][0] + '.pdf', bbox_inches = 'tight', dpi=None, facecolor='w', edgecolor='b',
def __init__(self, nQuantile=None):
    """Set up the two-step pipeline: column selection, then quantile scaling.

    ``nQuantile`` falls back to ``Separator_num.nQUANTILE`` when it is None.
    """
    if nQuantile is None:
        self.nQuantile = Separator_num.nQUANTILE
    else:
        self.nQuantile = nQuantile

    selector = Separator_num.Selector()
    # copy=False: use in-place scaling
    quantiler = QuantileTransformer(n_quantiles=self.nQuantile, copy=False)
    steps = [
        ('geometry', selector),
        ('quantiled', quantiler),
    ]
    Pipeline.__init__(self, steps)
return x Scalers = { 'Unscaled data': LinearScaler(), 'Standard scaling': StandardScaler(), 'Min-max scaling': MinMaxScaler(), 'Max-abs scaling': MaxAbsScaler(), 'Robust scaling': RobustScaler(quantile_range=(25, 75)), 'Quantile transformation (uniform pdf)': QuantileTransformer(output_distribution='uniform'), 'Quantile transformation (gaussian pdf)': QuantileTransformer(output_distribution='normal'), 'Sample-wise L2 normalizing': Normalizer(norm='l2'), 'Sample-wise L1 normalizing': Normalizer(norm='l1'), } def create_axes(title, figsize=(16, 6)): fig = plt.figure(figsize=figsize) fig.suptitle(title) # define the axis for the first plot left, width = 0.1, 0.22
# Author: Franz Weidmann
# Info: Creates for each host a one-class SVM classifier to retrieve the normal
# state of the host. All scalers and models are trained and saved together
# into a single joblib pickle file ("models.pkl").

import numpy as np
from sklearn import svm
# Public import path; the private ``sklearn.preprocessing.data`` module is not
# available in current scikit-learn releases.
from sklearn.preprocessing import QuantileTransformer

# ``sklearn.externals.joblib`` no longer exists in current scikit-learn;
# prefer the standalone package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

trainData = np.load("../../data/data.npy")

# transform data for scaling and save the state of the transformer for each host
# (the first axis of trainData is indexed per host)
# NOTE(review): the scaled values are written back into trainData in place;
# this assumes a floating-point array -- confirm the dtype of data.npy.
scalers = []
for h in range(trainData.shape[0]):
    _scaler = QuantileTransformer()
    trainData[h] = _scaler.fit_transform(trainData[h])
    scalers.append(_scaler)

# train an SVM one-class classifier for every host
models = []
for modelIndex in range(trainData.shape[0]):
    print("Creating model ", modelIndex)
    model = svm.OneClassSVM(kernel="rbf", verbose=True)
    model.fit(trainData[modelIndex])
    models.append(model)
    print("Trained model ", modelIndex)

# scalers[i] and models[i] belong to the same host
joblib.dump([scalers, models], "models.pkl")