def getClassWeights(self, weightType, dataSet=None):
    if weightType not in [
            "Freq", "MedianFreq", "1x", "2x", "division", "relativeToMin",
            "quantile"
    ]:
        raise ValueError(
            "Wrong weights calc type given! Valid arguments are "
            "[Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]")

    # Compute class weights to counter the imbalanced dataset
    # (e.g. a lot of road and building pixels).
    print("Calculate class", weightType, "weights...")

    # For performance reasons, only a fraction of the training split is used.
    # PART = 1 would use the whole dataset.
    PART = 10
    classCount = np.zeros(self.config["classes"])

    # Count the occurrences of every class over all sampled label images.
    for i in range(int(self.config["trainSize"] / PART)):
        labelImg = self.getImage(i, "trainLabel").flatten()
        labelClassCount = np.bincount(labelImg, minlength=self.config["classes"])
        classCount += labelClassCount
        if i % int(1000 / PART) == 0:
            print("Label image", i, "/", self.config["trainSize"] / PART)

    print("Class count:", classCount.shape, classCount)

    # Choose the class weighting scheme.
    if weightType == "Freq":
        # Frequency: median class count divided by each class count.
        classWeights = np.median(classCount) / classCount
    elif weightType == "MedianFreq":
        # Median frequency balancing.
        classWeights = np.median(np.median(classCount) / classCount) / (
            np.median(classCount) / classCount)
    elif weightType == "1x":
        # Simple 1 minus the class share of the total pixel count.
        classWeights = 1 - (classCount / classCount.sum() * 1)
    elif weightType == "2x":
        # Same as "1x" but with doubled effect.
        classWeights = 1 - (classCount / classCount.sum() * 2)
    elif weightType == "division":
        # Total count divided by each class count, then divided by the minimum.
        classWeights = classCount.sum() / classCount
        classWeights[classWeights == 1] = 999999
        classWeights /= classWeights.min()
    elif weightType == "relativeToMin":
        # All weights are relative to the smallest class, which is assigned 1.0.
        # The minimal assigned value is 0.1.
        classWeights = classCount.min() / classCount
        print("Class weights:", classWeights.shape, classWeights)
        classWeights[classWeights < 0.1] *= 10
    elif weightType == "quantile":
        # Using sklearn's QuantileTransformer, the weights are distributed
        # in [0, 0.9999]. The minimal assigned value is 0.1.
        from sklearn.preprocessing import QuantileTransformer
        _scaler = QuantileTransformer()
        classCount = np.expand_dims(classCount, axis=1)
        classWeights = _scaler.fit_transform(classCount)
        classWeights = np.around(classWeights, decimals=4)
        classWeights = np.squeeze(classWeights)
        classWeights = 1 - classWeights
        classWeights[classWeights < 0.1] = 0.1
    else:
        raise ValueError(
            "Wrong weights calc type given! Valid arguments are "
            "[Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]")

    # Eliminate inf values (classes that never occur in the sampled labels).
    classWeights[classWeights == np.inf] = 1
    print("Class weights:", classWeights.shape, classWeights)
    np.save(
        "classWeights" + str(self.config["x"]) + str(self.config["y"]) +
        self.config["name"], classWeights)
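
# A minimal sketch of consuming the saved weights (assumption: the weights are
# later used to scale a per-pixel loss; the config values below and the
# weightedPixelLoss helper are hypothetical, not part of the original class).
import numpy as np

config = {"x": 256, "y": 256, "name": "dataset"}  # hypothetical config values
classWeights = np.load("classWeights" + str(config["x"]) + str(config["y"]) +
                       config["name"] + ".npy")

def weightedPixelLoss(pixelLoss, labelImg):
    # Scale each pixel's loss by the weight of its ground-truth class.
    return pixelLoss * classWeights[labelImg.astype(int)]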
import copy

import numpy as np
import pandas as pd
from scipy.special import boxcox1p
from sklearn.preprocessing import (LabelEncoder, MaxAbsScaler, MinMaxScaler,
                                   OneHotEncoder, QuantileTransformer,
                                   RobustScaler, StandardScaler)


class FeatureMap(object):
    def __init__(self, df):
        self.df = copy.deepcopy(df)
        self.onehot = None
        self.label_code = None
        self.col_label_dict = dict()
        self.min_max_scale = None
        self.max_abs_scale = None
        self.standard_scale = None
        self.robust_scale = None
        self.quantile_transform = None

    def log_map(self, col_need, col_replace=True):
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.apply(lambda x: np.log(x))
        else:
            col_need_extend = [col + "_log" for col in col_need]
            self.df[col_need_extend] = df_need.apply(lambda x: np.log(x))

    def box_cox_map(self, col_need, gamma=1.0, col_replace=True):
        """
        y = ((1+x)**gamma - 1) / gamma  if gamma != 0
            log(1+x)                    if gamma == 0
        ref: http://onlinestatbook.com/2/transformations/box-cox.html
        :param col_need:
        :param gamma:
        :param col_replace:
        :return:
        """
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.applymap(lambda x: boxcox1p(x, gamma))
        else:
            col_need_extend = [col + "_boxCox" for col in col_need]
            self.df[col_need_extend] = df_need.applymap(
                lambda x: boxcox1p(x, gamma))

    def onehot_encode(self, col_need, start_zero=True):
        """
        One-hot encode the requested columns of the DataFrame.
        Note: the original categories should be integers in range(classes)
        or range(1, classes + 1).
        :param col_need:
        :param start_zero: categories are in range(classes)
        :return: new DataFrame without col_need, after one-hot encoding;
                 the start index follows start_zero
        """
        self.onehot = OneHotEncoder(sparse=False)
        array_onehot = self.onehot.fit_transform(self.df.loc[:, col_need])
        col_onehot = []
        for col_index in range(len(col_need)):
            if start_zero:
                for hot_index in range(self.onehot.n_values_[col_index]):
                    col_onehot.append(col_need[col_index] + str(hot_index))
            else:
                for hot_index in range(1, self.onehot.n_values_[col_index]):
                    col_onehot.append(col_need[col_index] + str(hot_index))
        self.df.drop(col_need, axis=1, inplace=True)
        df_onehot = pd.DataFrame(array_onehot, columns=col_onehot,
                                 index=self.df.index)
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def label_encode(self, col_need):
        """
        Label-encode the requested column of the DataFrame.
        :param col_need: length should be 1
        :return: new DataFrame without col_need, after label encoding,
                 starting from 0
        """
        assert isinstance(col_need, list) and len(col_need) == 1
        self.label_code = LabelEncoder()
        array_label_code = self.label_code.fit_transform(
            self.df.loc[:, col_need[0]])
        label_list = list(self.label_code.classes_)
        for i, x in enumerate(label_list):
            self.col_label_dict[col_need[0] + "_" + str(i)] = col_need[0] + "_" + x
        self.df.drop(col_need, axis=1, inplace=True)
        df_label_code = pd.DataFrame(array_label_code, columns=col_need,
                                     index=self.df.index)
        self.df = pd.concat([self.df, df_label_code], axis=1)

    def standard_scale_map(self, col_need, drop_origin_col=False):
        self.standard_scale = StandardScaler()
        array_standard = self.standard_scale.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_standard, column_name=col_need,
                        suffix="_stdScale",
                        drop_origin_columns=drop_origin_col)

    def min_max_scale_map(self, col_need, drop_origin_col=False):
        self.min_max_scale = MinMaxScaler()
        array_min_max = self.min_max_scale.fit_transform(self.df.loc[:, col_need])
        self._scale_map(array=array_min_max, column_name=col_need,
                        suffix="_minMaxScale",
                        drop_origin_columns=drop_origin_col)

    def max_abs_scale_map(self, col_need, drop_origin_col=False):
        self.max_abs_scale = MaxAbsScaler()
        array_max_abs = self.max_abs_scale.fit_transform(self.df.loc[:, col_need])
        self._scale_map(array=array_max_abs, column_name=col_need,
                        suffix="_maxAbsScale",
                        drop_origin_columns=drop_origin_col)

    def robust_scale_map(self, col_need, quantile_range=(25, 75),
                         drop_origin_col=False):
        """
        This scaler removes the median and scales the data according to the
        quantile range (defaults to the IQR: interquartile range). The IQR is
        the range between the 1st quartile (25th percentile) and the 3rd
        quartile (75th percentile).
        :param col_need:
        :param quantile_range:
        :param drop_origin_col:
        :return:
        """
        self.robust_scale = RobustScaler(quantile_range=quantile_range)
        array_robust = self.robust_scale.fit_transform(self.df.loc[:, col_need])
        self._scale_map(array=array_robust, column_name=col_need,
                        suffix="_robust_scale",
                        drop_origin_columns=drop_origin_col)

    def quantile_scale_map(self, col_need, distribution='uniform',
                           drop_origin_col=False):
        """
        :param col_need:
        :param distribution: 'uniform' (default) or 'normal'
        :param drop_origin_col:
        :return:
        """
        self.quantile_transform = QuantileTransformer(
            output_distribution=distribution)
        array_quantile = self.quantile_transform.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_quantile, column_name=col_need,
                        suffix="_q{}Map".format(distribution.capitalize()),
                        drop_origin_columns=drop_origin_col)

    def _scale_map(self, array, column_name, suffix, drop_origin_columns=False):
        if drop_origin_columns:
            self.df.drop(column_name, axis=1, inplace=True)
        col = [col + suffix for col in column_name]
        df_scale = pd.DataFrame(array, columns=col, index=self.df.index)
        self.df = pd.concat([self.df, df_scale], axis=1)

    def quantile_floor_map(self, col_need, floor_num=5, drop_origin_col=False):
        """
        Use after quantile_scale_map with distribution='uniform', when values
        are scaled into [0, 1]; for tree models, one-hot encoding is then needed.
        :param col_need:
        :param floor_num: number of uniform floor bins
        :param drop_origin_col:
        :return:
        """
        bool0 = (self.df.loc[:, col_need] >= 0) & (self.df.loc[:, col_need] <= 1)
        assert bool0.all().all()
        col_suffix = np.array([x.endswith("_qUniformMap") for x in col_need])
        assert col_suffix.all()
        array_quantile_floor = (self.df.loc[:, col_need].values *
                                floor_num).astype(int)
        self._scale_map(array=array_quantile_floor, column_name=col_need,
                        suffix="_qFloorMap",
                        drop_origin_columns=drop_origin_col)
df['BS1' + str(i1)] = df1['ans1'].str[i1]

# Remove the original style name now.
df.drop(['Style Name'], axis=1, inplace=True)
df = df.dropna(axis=0)
c = df.columns[df.dtypes.eq(object)]
df[c] = df[c].apply(pd.to_numeric, errors='coerce', axis=0)

scaler = QuantileTransformer()
#df3 = scaler.fit_transform(df)
X5 = np.array(df.drop(['Score'], axis=1))
y5 = np.array(df['Score'])
X3 = scaler.fit_transform(pd.DataFrame(X5))
#y3 = scaler.fit_transform(pd.DataFrame(y5))
y3 = 1.2 - np.log(y5)

# train_test_split now lives in sklearn.model_selection
# (sklearn.cross_validation has been removed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X5, y5, test_size=0.20)
# Data munging completed. ######################

from keras import layers, models


def build_model():
    model = models.Sequential()
    # 32 - depth of the next layer, i.e. the number of filters in that layer;
    # each filter acts as a unique feature detector.
# Author: Franz Weidmann
# Info: Creates for each host a one-class SVM classifier to capture the normal
# state of the host. All models are trained and saved to disk together with
# their per-host scalers.

import joblib
import numpy as np
from sklearn import svm
from sklearn.preprocessing import QuantileTransformer

trainData = np.load("../../data/data.npy")

# Scale the data per host and keep each fitted transformer so the same
# scaling can be applied again at prediction time.
scalers = []
for h in range(trainData.shape[0]):
    _scaler = QuantileTransformer()
    trainData[h] = _scaler.fit_transform(trainData[h])
    scalers.append(_scaler)

# Train a one-class SVM for every host.
models = []
for modelIndex in range(trainData.shape[0]):
    print("Creating model", modelIndex)
    model = svm.OneClassSVM(kernel="rbf", verbose=True)
    model.fit(trainData[modelIndex])
    models.append(model)
    print("Trained model", modelIndex)

joblib.dump([scalers, models], "models.pkl")
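
# A minimal usage sketch for the saved artifacts (assumption: new observations
# arrive with the same (hosts, samples, features) layout as data.npy; the file
# name newData.npy is hypothetical).
import joblib
import numpy as np

scalers, models = joblib.load("models.pkl")
newData = np.load("../../data/newData.npy")

for h in range(newData.shape[0]):
    # Apply the scaler fitted on this host's training data, then predict
    # +1 (normal) / -1 (anomalous) for each sample.
    scaled = scalers[h].transform(newData[h])
    predictions = models[h].predict(scaled)
    print("Host", h, "anomalies:", int(np.sum(predictions == -1)))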